[Python-checkins] bpo-46932: Update bundled libexpat to 2.4.7 (GH-31736) (GH-31741)

ned-deily webhook-mailer at python.org
Mon Mar 7 18:11:56 EST 2022


https://github.com/python/cpython/commit/f656bc1cdbdfaaa07f66ed97e011b258b97e2788
commit: f656bc1cdbdfaaa07f66ed97e011b258b97e2788
branch: 3.7
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: ned-deily <nad at python.org>
date: 2022-03-07T18:11:09-05:00
summary:

bpo-46932: Update bundled libexpat to 2.4.7 (GH-31736) (GH-31741)

(cherry picked from commit 176835c3d5c70f4c1b152cc2062b549144e37094)

Co-authored-by: Steve Dower <steve.dower at python.org>

files:
A Misc/NEWS.d/next/Library/2022-03-07-20-20-34.bpo-46932.xbarAs.rst
M Modules/expat/expat.h
M Modules/expat/xmlparse.c

diff --git a/Misc/NEWS.d/next/Library/2022-03-07-20-20-34.bpo-46932.xbarAs.rst b/Misc/NEWS.d/next/Library/2022-03-07-20-20-34.bpo-46932.xbarAs.rst
new file mode 100644
index 0000000000000..8545c656eab89
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-07-20-20-34.bpo-46932.xbarAs.rst
@@ -0,0 +1 @@
+Update bundled libexpat to 2.4.7
diff --git a/Modules/expat/expat.h b/Modules/expat/expat.h
index 46a0e1bcd22de..c9214f64070a8 100644
--- a/Modules/expat/expat.h
+++ b/Modules/expat/expat.h
@@ -15,6 +15,7 @@
    Copyright (c) 2016      Cristian Rodríguez <crrodriguez at opensuse.org>
    Copyright (c) 2016      Thomas Beutlich <tc at tbeu.de>
    Copyright (c) 2017      Rhodri James <rhodri at wildebeest.org.uk>
+   Copyright (c) 2022      Thijs Schreijer <thijs at thijsschreijer.nl>
    Licensed under the MIT license:
 
    Permission is  hereby granted,  free of charge,  to any  person obtaining
@@ -174,8 +175,10 @@ struct XML_cp {
 };
 
 /* This is called for an element declaration. See above for
-   description of the model argument. It's the caller's responsibility
-   to free model when finished with it.
+   description of the model argument. It's the user code's responsibility
+   to free model when finished with it. See XML_FreeContentModel.
+   There is no need to free the model from the handler, it can be kept
+   around and freed at a later stage.
 */
 typedef void(XMLCALL *XML_ElementDeclHandler)(void *userData,
                                               const XML_Char *name,
@@ -237,6 +240,17 @@ XML_ParserCreate(const XML_Char *encoding);
    and the local part will be concatenated without any separator.
    It is a programming error to use the separator '\0' with namespace
    triplets (see XML_SetReturnNSTriplet).
+   If a namespace separator is chosen that can be part of a URI or
+   part of an XML name, splitting an expanded name back into its
+   1, 2 or 3 original parts on application level in the element handler
+   may end up vulnerable, so these are advised against;  sane choices for
+   a namespace separator are e.g. '\n' (line feed) and '|' (pipe).
+
+   Note that Expat does not validate namespace URIs (beyond encoding)
+   against RFC 3986 today (and is not required to do so with regard to
+   the XML 1.0 namespaces specification) but it may start doing that
+   in future releases.  Before that, an application using Expat must
+   be ready to receive namespace URIs containing non-URI characters.
 */
 XMLPARSEAPI(XML_Parser)
 XML_ParserCreateNS(const XML_Char *encoding, XML_Char namespaceSeparator);
@@ -317,7 +331,7 @@ typedef void(XMLCALL *XML_StartDoctypeDeclHandler)(void *userData,
                                                    const XML_Char *pubid,
                                                    int has_internal_subset);
 
-/* This is called for the start of the DOCTYPE declaration when the
+/* This is called for the end of the DOCTYPE declaration when the
    closing > is encountered, but after processing any external
    subset.
 */
@@ -1041,7 +1055,7 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold(
 */
 #define XML_MAJOR_VERSION 2
 #define XML_MINOR_VERSION 4
-#define XML_MICRO_VERSION 6
+#define XML_MICRO_VERSION 7
 
 #ifdef __cplusplus
 }
diff --git a/Modules/expat/xmlparse.c b/Modules/expat/xmlparse.c
index 7db28d07acbcd..05216d997b07f 100644
--- a/Modules/expat/xmlparse.c
+++ b/Modules/expat/xmlparse.c
@@ -1,4 +1,4 @@
-/* a30d2613dcfdef81475a9d1a349134d2d42722172fdaa7d5bb12ed2aa74b9596 (2.4.6+)
+/* fcb1a62fefa945567301146eb98e3ad3413e823a41c4378e84e8b6b6f308d824 (2.4.7+)
                             __  __            _
                          ___\ \/ /_ __   __ _| |_
                         / _ \\  /| '_ \ / _` | __|
@@ -34,6 +34,7 @@
    Copyright (c) 2019      Vadim Zeitlin <vadim at zeitlins.org>
    Copyright (c) 2021      Dong-hee Na <donghee.na at python.org>
    Copyright (c) 2022      Samanta Navarro <ferivoz at riseup.net>
+   Copyright (c) 2022      Jeffrey Walton <noloader at gmail.com>
    Licensed under the MIT license:
 
    Permission is  hereby granted,  free of charge,  to any  person obtaining
@@ -133,7 +134,7 @@
       * BSD / macOS (including <10.7) (arc4random): HAVE_ARC4RANDOM, \
       * libbsd (arc4random_buf): HAVE_ARC4RANDOM_BUF + HAVE_LIBBSD, \
       * libbsd (arc4random): HAVE_ARC4RANDOM + HAVE_LIBBSD, \
-      * Linux (including <3.17) / BSD / macOS (including <10.7) (/dev/urandom): XML_DEV_URANDOM, \
+      * Linux (including <3.17) / BSD / macOS (including <10.7) / Solaris >=8 (/dev/urandom): XML_DEV_URANDOM, \
       * Windows >=Vista (rand_s): _WIN32. \
     \
     If insist on not using any of these, bypass this error by defining \
@@ -722,6 +723,7 @@ XML_ParserCreateNS(const XML_Char *encodingName, XML_Char nsSep) {
   return XML_ParserCreate_MM(encodingName, NULL, tmp);
 }
 
+// "xml=http://www.w3.org/XML/1998/namespace"
 static const XML_Char implicitContext[]
     = {ASCII_x,     ASCII_m,     ASCII_l,      ASCII_EQUALS, ASCII_h,
        ASCII_t,     ASCII_t,     ASCII_p,      ASCII_COLON,  ASCII_SLASH,
@@ -3704,12 +3706,124 @@ storeAtts(XML_Parser parser, const ENCODING *enc, const char *attStr,
   return XML_ERROR_NONE;
 }
 
+static XML_Bool
+is_rfc3986_uri_char(XML_Char candidate) {
+  // For the RFC 3986 ANBF grammar see
+  // https://datatracker.ietf.org/doc/html/rfc3986#appendix-A
+
+  switch (candidate) {
+  // From rule "ALPHA" (uppercase half)
+  case 'A':
+  case 'B':
+  case 'C':
+  case 'D':
+  case 'E':
+  case 'F':
+  case 'G':
+  case 'H':
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P':
+  case 'Q':
+  case 'R':
+  case 'S':
+  case 'T':
+  case 'U':
+  case 'V':
+  case 'W':
+  case 'X':
+  case 'Y':
+  case 'Z':
+
+  // From rule "ALPHA" (lowercase half)
+  case 'a':
+  case 'b':
+  case 'c':
+  case 'd':
+  case 'e':
+  case 'f':
+  case 'g':
+  case 'h':
+  case 'i':
+  case 'j':
+  case 'k':
+  case 'l':
+  case 'm':
+  case 'n':
+  case 'o':
+  case 'p':
+  case 'q':
+  case 'r':
+  case 's':
+  case 't':
+  case 'u':
+  case 'v':
+  case 'w':
+  case 'x':
+  case 'y':
+  case 'z':
+
+  // From rule "DIGIT"
+  case '0':
+  case '1':
+  case '2':
+  case '3':
+  case '4':
+  case '5':
+  case '6':
+  case '7':
+  case '8':
+  case '9':
+
+  // From rule "pct-encoded"
+  case '%':
+
+  // From rule "unreserved"
+  case '-':
+  case '.':
+  case '_':
+  case '~':
+
+  // From rule "gen-delims"
+  case ':':
+  case '/':
+  case '?':
+  case '#':
+  case '[':
+  case ']':
+  case '@':
+
+  // From rule "sub-delims"
+  case '!':
+  case '$':
+  case '&':
+  case '\'':
+  case '(':
+  case ')':
+  case '*':
+  case '+':
+  case ',':
+  case ';':
+  case '=':
+    return XML_TRUE;
+
+  default:
+    return XML_FALSE;
+  }
+}
+
 /* addBinding() overwrites the value of prefix->binding without checking.
    Therefore one must keep track of the old value outside of addBinding().
 */
 static enum XML_Error
 addBinding(XML_Parser parser, PREFIX *prefix, const ATTRIBUTE_ID *attId,
            const XML_Char *uri, BINDING **bindingsPtr) {
+  // "http://www.w3.org/XML/1998/namespace"
   static const XML_Char xmlNamespace[]
       = {ASCII_h,      ASCII_t,     ASCII_t,     ASCII_p,      ASCII_COLON,
          ASCII_SLASH,  ASCII_SLASH, ASCII_w,     ASCII_w,      ASCII_w,
@@ -3720,6 +3834,7 @@ addBinding(XML_Parser parser, PREFIX *prefix, const ATTRIBUTE_ID *attId,
          ASCII_e,      ASCII_s,     ASCII_p,     ASCII_a,      ASCII_c,
          ASCII_e,      '\0'};
   static const int xmlLen = (int)sizeof(xmlNamespace) / sizeof(XML_Char) - 1;
+  // "http://www.w3.org/2000/xmlns/"
   static const XML_Char xmlnsNamespace[]
       = {ASCII_h,     ASCII_t,      ASCII_t, ASCII_p, ASCII_COLON,  ASCII_SLASH,
          ASCII_SLASH, ASCII_w,      ASCII_w, ASCII_w, ASCII_PERIOD, ASCII_w,
@@ -3760,14 +3875,26 @@ addBinding(XML_Parser parser, PREFIX *prefix, const ATTRIBUTE_ID *attId,
         && (len > xmlnsLen || uri[len] != xmlnsNamespace[len]))
       isXMLNS = XML_FALSE;
 
-    // NOTE: While Expat does not validate namespace URIs against RFC 3986,
-    //       we have to at least make sure that the XML processor on top of
-    //       Expat (that is splitting tag names by namespace separator into
-    //       2- or 3-tuples (uri-local or uri-local-prefix)) cannot be confused
-    //       by an attacker putting additional namespace separator characters
-    //       into namespace declarations.  That would be ambiguous and not to
-    //       be expected.
-    if (parser->m_ns && (uri[len] == parser->m_namespaceSeparator)) {
+    // NOTE: While Expat does not validate namespace URIs against RFC 3986
+    //       today (and is not REQUIRED to do so with regard to the XML 1.0
+    //       namespaces specification) we have to at least make sure, that
+    //       the application on top of Expat (that is likely splitting expanded
+    //       element names ("qualified names") of form
+    //       "[uri sep] local [sep prefix] '\0'" back into 1, 2 or 3 pieces
+    //       in its element handler code) cannot be confused by an attacker
+    //       putting additional namespace separator characters into namespace
+    //       declarations.  That would be ambiguous and not to be expected.
+    //
+    //       While the HTML API docs of function XML_ParserCreateNS have been
+    //       advising against use of a namespace separator character that can
+    //       appear in a URI for >20 years now, some widespread applications
+    //       are using URI characters (':' (colon) in particular) for a
+    //       namespace separator, in practice.  To keep these applications
+    //       functional, we only reject namespaces URIs containing the
+    //       application-chosen namespace separator if the chosen separator
+    //       is a non-URI character with regard to RFC 3986.
+    if (parser->m_ns && (uri[len] == parser->m_namespaceSeparator)
+        && ! is_rfc3986_uri_char(uri[len])) {
       return XML_ERROR_SYNTAX;
     }
   }



More information about the Python-checkins mailing list