Update python libs (#11669)

* Update feedparser * Update markdown2
pymedusa · Mar 27, 2024 · 50a9e00 · 50a9e00
1 parent 805353e
commit 50a9e00
Show file tree

Hide file tree

Showing 33 changed files with 100 additions and 58 deletions.
diff --git a/ext/feedparser/__init__.py b/ext/feedparser/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -32,7 +32,7 @@
 
 __author__ = 'Kurt McKee <[email protected]>'
 __license__ = 'BSD 2-clause'
-__version__ = '6.0.10'
+__version__ = '6.0.11'
 
 # HTTP "User-Agent" header to send to servers when downloading feeds.
 # If you are embedding feedparser in a larger application, you should

diff --git a/ext/feedparser/api.py b/ext/feedparser/api.py
@@ -1,5 +1,5 @@
 # The public API for feedparser
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/__init__.py b/ext/feedparser/datetimes/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/asctime.py b/ext/feedparser/datetimes/asctime.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/greek.py b/ext/feedparser/datetimes/greek.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/hungarian.py b/ext/feedparser/datetimes/hungarian.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/iso8601.py b/ext/feedparser/datetimes/iso8601.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/korean.py b/ext/feedparser/datetimes/korean.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/perforce.py b/ext/feedparser/datetimes/perforce.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/rfc822.py b/ext/feedparser/datetimes/rfc822.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/datetimes/w3dtf.py b/ext/feedparser/datetimes/w3dtf.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/encodings.py b/ext/feedparser/encodings.py
@@ -1,5 +1,5 @@
 # Character encoding routines
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #
@@ -26,9 +26,9 @@
 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 # POSSIBILITY OF SUCH DAMAGE.
 
-import cgi
 import codecs
 import re
+import typing as t
 
 try:
     try:
@@ -68,6 +68,30 @@ def lazy_chardet_encoding(data):
 RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')
 
 
+def parse_content_type(line: str) -> t.Tuple[str, str]:
+    """Parse an HTTP Content-Type header into ``(mime_type, charset)``.
+
+    The MIME type is the whitespace-stripped text before the first ";".
+    The charset is the value of the last "charset" parameter (the key is
+    matched case-insensitively), stripped of surrounding whitespace and
+    quote characters; it is "" when no such parameter is present.
+    Replaces cgi.parse_header(); cgi will be removed in Python 3.13.
+    """
+
+    chunks = line.split(";")
+    if not chunks:  # NOTE(review): split(";") always yields >= 1 item; looks unreachable
+        return "", ""
+
+    mime_type = chunks[0].strip()
+    charset_value = ""
+    for chunk in chunks[1:]:  # no early exit: a later charset overrides an earlier one
+        key, _, value = chunk.partition("=")
+        if key.strip().lower() == "charset":
+            charset_value = value.strip().strip("\"'")
+
+    return mime_type, charset_value
+
+
 def convert_to_utf8(http_headers, data, result):
     """Detect and convert the character encoding to UTF-8.
 
@@ -181,10 +205,7 @@ def convert_to_utf8(http_headers, data, result):
     # XML declaration encoding, and HTTP encoding, following the
     # heuristic defined in RFC 3023.
     http_content_type = http_headers.get('content-type') or ''
-    http_content_type, params = cgi.parse_header(http_content_type)
-    http_encoding = params.get('charset', '').replace("'", "")
-    if isinstance(http_encoding, bytes):
-        http_encoding = http_encoding.decode('utf-8', 'ignore')
+    http_content_type, http_encoding = parse_content_type(http_content_type)
 
     acceptable_content_type = 0
     application_content_types = ('application/xml', 'application/xml-dtd',

diff --git a/ext/feedparser/exceptions.py b/ext/feedparser/exceptions.py
@@ -1,5 +1,5 @@
 # Exceptions used throughout feedparser
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/html.py b/ext/feedparser/html.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/http.py b/ext/feedparser/http.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/mixin.py b/ext/feedparser/mixin.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/namespaces/_base.py b/ext/feedparser/namespaces/_base.py
@@ -1,5 +1,5 @@
 # Support for the Atom, RSS, RDF, and CDF feed formats
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/namespaces/admin.py b/ext/feedparser/namespaces/admin.py
@@ -1,5 +1,5 @@
 # Support for the administrative elements extension
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/namespaces/cc.py b/ext/feedparser/namespaces/cc.py
@@ -1,5 +1,5 @@
 # Support for the Creative Commons licensing extensions
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/namespaces/dc.py b/ext/feedparser/namespaces/dc.py
@@ -1,5 +1,5 @@
 # Support for the Dublin Core metadata extensions
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/namespaces/georss.py b/ext/feedparser/namespaces/georss.py
@@ -1,5 +1,5 @@
 # Support for the GeoRSS format
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/namespaces/itunes.py b/ext/feedparser/namespaces/itunes.py
@@ -1,5 +1,5 @@
 # Support for the iTunes format
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/namespaces/mediarss.py b/ext/feedparser/namespaces/mediarss.py
@@ -1,5 +1,5 @@
 # Support for the Media RSS format
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/namespaces/psc.py b/ext/feedparser/namespaces/psc.py
@@ -1,5 +1,5 @@
 # Support for the Podlove Simple Chapters format
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/parsers/loose.py b/ext/feedparser/parsers/loose.py
@@ -1,5 +1,5 @@
 # The loose feed parser that interfaces with an SGML parsing library
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/parsers/strict.py b/ext/feedparser/parsers/strict.py
@@ -1,5 +1,5 @@
 # The strict feed parser that interfaces with an XML parsing library
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/sanitizer.py b/ext/feedparser/sanitizer.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/sgml.py b/ext/feedparser/sgml.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/urls.py b/ext/feedparser/urls.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/feedparser/util.py b/ext/feedparser/util.py
@@ -1,4 +1,4 @@
-# Copyright 2010-2022 Kurt McKee <[email protected]>
+# Copyright 2010-2023 Kurt McKee <[email protected]>
 # Copyright 2002-2008 Mark Pilgrim
 # All rights reserved.
 #

diff --git a/ext/markdown2.py b/ext/markdown2.py
@@ -106,7 +106,7 @@
 #   not yet sure if there implications with this. Compare 'pydoc sre'
 #   and 'perldoc perlre'.
 
-__version_info__ = (2, 4, 11)
+__version_info__ = (2, 4, 13)
 __version__ = '.'.join(map(str, __version_info__))
 __author__ = "Trent Mick"
 
@@ -783,8 +783,15 @@ def _detab(self, text):
     def _hash_html_block_sub(self, match, raw=False):
         if isinstance(match, str):
             html = match
+            tag = None
         else:
             html = match.group(1)
+            try:
+                tag = match.group(2)
+            except IndexError:
+                tag = None
+
+        tag = tag or re.match(r'.*?<(\S).*?>', html).group(1)
 
         if raw and self.safe_mode:
             html = self._sanitize_html(html)
@@ -793,9 +800,17 @@ def _hash_html_block_sub(self, match, raw=False):
             m = self._html_markdown_attr_re.search(first_line)
             if m:
                 lines = html.split('\n')
+                # if MD is on same line as opening tag then split across two lines
+                lines = list(filter(None, (re.split(r'(.*?<%s.*markdown=.*?>)' % tag, lines[0])))) + lines[1:]
+                # if MD on same line as closing tag, split across two lines
+                lines = lines[:-1] + list(filter(None, re.split(r'(\s*?</%s>.*?$)' % tag, lines[-1])))
+                # extract key sections of the match
+                first_line = lines[0]
                 middle = '\n'.join(lines[1:-1])
                 last_line = lines[-1]
+                # remove `markdown="1"` attr from tag
                 first_line = first_line[:m.start()] + first_line[m.end():]
+                # hash the HTML segments to protect them
                 f_key = _hash_text(first_line)
                 self.html_blocks[f_key] = first_line
                 l_key = _hash_text(last_line)
@@ -1238,24 +1253,24 @@ def _do_tables(self, text):
         """
         less_than_tab = self.tab_width - 1
         table_re = re.compile(r'''
-                (?:(?<=\n\n)|\A\n?)             # leading blank line
+                (?:(?<=\n)|\A\n?)             # leading blank line
 
                 ^[ ]{0,%d}                      # allowed whitespace
-                (.*[|].*)  \n                   # $1: header row (at least one pipe)
+                (.*[|].*)[ ]*\n                   # $1: header row (at least one pipe)
 
                 ^[ ]{0,%d}                      # allowed whitespace
                 (                               # $2: underline row
                     # underline row with leading bar
-                    (?:  \|\ *:?-+:?\ *  )+  \|? \s? \n
+                    (?:  \|\ *:?-+:?\ *  )+  \|? \s?[ ]*\n
                     |
                     # or, underline row without leading bar
-                    (?:  \ *:?-+:?\ *\|  )+  (?:  \ *:?-+:?\ *  )? \s? \n
+                    (?:  \ *:?-+:?\ *\|  )+  (?:  \ *:?-+:?\ *  )? \s?[ ]*\n
                 )
 
                 (                               # $3: data rows
                     (?:
                         ^[ ]{0,%d}(?!\ )         # ensure line begins with 0 to less_than_tab spaces
-                        .*\|.*  \n
+                        .*\|.*[ ]*\n
                     )+
                 )
             ''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
@@ -1351,15 +1366,20 @@ def _run_span_gamut(self, text):
             text = self._do_smart_punctuation(text)
 
         # Do hard breaks:
-        if 'breaks' in self.extras:
-            break_tag = "<br%s\n" % self.empty_element_suffix
-            # do backslashes first because on_newline inserts the break before the newline
-            if self.extras['breaks'].get('on_backslash', False):
-                text = re.sub(r' *\\\n', break_tag, text)
-            if self.extras['breaks'].get('on_newline', False):
-                text = re.sub(r" *\n(?!\<(?:\/?(ul|ol|li))\>)", break_tag, text)
+        on_backslash = self.extras.get('breaks', {}).get('on_backslash', False)
+        on_newline = self.extras.get('breaks', {}).get('on_newline', False)
+
+        if on_backslash and on_newline:
+            pattern = r' *\\?'
+        elif on_backslash:
+            pattern = r'(?: *\\| {2,})'
+        elif on_newline:
+            pattern = r' *'
         else:
-            text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
+            pattern = r' {2,}'
+
+        break_tag = "<br%s\n" % self.empty_element_suffix
+        text = re.sub(pattern + r"\n(?!\<(?:\/?(ul|ol|li))\>)", break_tag, text)
 
         return text
 
@@ -1775,7 +1795,8 @@ def _do_links(self, text):
                             curr_pos = start_idx + 1
                     else:
                         # This id isn't defined, leave the markup alone.
-                        curr_pos = match.end()
+                        # set current pos to end of link title and continue from there
+                        curr_pos = p
                     continue
 
             # Otherwise, it isn't markup.
@@ -2395,9 +2416,9 @@ def _do_tg_spoiler(self, text):
         text = self._tg_spoiler_re.sub(r"<tg-spoiler>\1</tg-spoiler>", text)
         return text
 
-    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.*\S)\1", re.S)
+    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]?)(?<=\S)\1", re.S)
     _em_re = r"(\*|_)(?=\S)(.*?\S)\1"
-    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.*\S)\*\*", re.S)
+    _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]?)(?<=\S)\*\*", re.S)
     _code_friendly_em_re = r"\*(?=\S)(.+?)\*"
     def _do_italics_and_bold(self, text):
         if self.extras.get('middle-word-em', True) is False:
@@ -2623,7 +2644,7 @@ def _encode_amps_and_angles(self, text):
         text = self._naked_gt_re.sub('&gt;', text)
         return text
 
-    _incomplete_tags_re = re.compile(r"<(/?\w+?(?!\w)\s*?.+?[\s/]+?)")
+    _incomplete_tags_re = re.compile(r"<(!--|/?\w+?(?!\w)\s*?.+?[\s/]+?)")
 
     def _encode_incomplete_tags(self, text):
         if self.safe_mode not in ("replace", "escape"):