Skip to content

Commit

Permalink
Update python libs (#11669)
Browse files Browse the repository at this point in the history
* Update feedparser

* Update markdown2
  • Loading branch information
medariox authored Mar 27, 2024
1 parent 805353e commit 50a9e00
Show file tree
Hide file tree
Showing 33 changed files with 100 additions and 58 deletions.
4 changes: 2 additions & 2 deletions ext/feedparser/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down Expand Up @@ -32,7 +32,7 @@

__author__ = 'Kurt McKee <[email protected]>'
__license__ = 'BSD 2-clause'
__version__ = '6.0.10'
__version__ = '6.0.11'

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# The public API for feedparser
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/asctime.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/greek.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/hungarian.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/iso8601.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/korean.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/perforce.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/rfc822.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/datetimes/w3dtf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
33 changes: 27 additions & 6 deletions ext/feedparser/encodings.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Character encoding routines
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand All @@ -26,9 +26,9 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import cgi
import codecs
import re
import typing as t

try:
try:
Expand Down Expand Up @@ -68,6 +68,30 @@ def lazy_chardet_encoding(data):
RE_XML_PI_ENCODING = re.compile(br'^<\?.*encoding=[\'"](.*?)[\'"].*\?>')


def parse_content_type(line: str) -> t.Tuple[str, str]:
    """Parse an HTTP Content-Type header.

    This is a custom replacement for Python's cgi.parse_header().
    The cgi module will be removed in Python 3.13.

    :param line: The raw header value, e.g. ``'text/html; charset="utf-8"'``.
    :returns: A ``(mime_type, charset)`` tuple of strings. ``mime_type`` is
        the text before the first ``;`` with surrounding whitespace removed.
        ``charset`` is the value of the "charset" parameter with surrounding
        whitespace and quote characters removed, or ``""`` if there is none.
    """

    # str.split() with an explicit separator always yields at least one
    # element, so an empty header simply produces an empty MIME type here.
    mime_chunk, *param_chunks = line.split(";")

    charset_value = ""
    for param in param_chunks:
        key, _, value = param.partition("=")
        # Parameter names are matched case-insensitively. No early exit:
        # when "charset" is repeated, the last occurrence wins.
        if key.strip().lower() == "charset":
            charset_value = value.strip().strip("\"'")

    return mime_chunk.strip(), charset_value


def convert_to_utf8(http_headers, data, result):
"""Detect and convert the character encoding to UTF-8.
Expand Down Expand Up @@ -181,10 +205,7 @@ def convert_to_utf8(http_headers, data, result):
# XML declaration encoding, and HTTP encoding, following the
# heuristic defined in RFC 3023.
http_content_type = http_headers.get('content-type') or ''
http_content_type, params = cgi.parse_header(http_content_type)
http_encoding = params.get('charset', '').replace("'", "")
if isinstance(http_encoding, bytes):
http_encoding = http_encoding.decode('utf-8', 'ignore')
http_content_type, http_encoding = parse_content_type(http_content_type)

acceptable_content_type = 0
application_content_types = ('application/xml', 'application/xml-dtd',
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/exceptions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Exceptions used throughout feedparser
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/http.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/mixin.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/namespaces/_base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Support for the Atom, RSS, RDF, and CDF feed formats
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/namespaces/admin.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Support for the administrative elements extension
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/namespaces/cc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Support for the Creative Commons licensing extensions
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/namespaces/dc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Support for the Dublin Core metadata extensions
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/namespaces/georss.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Support for the GeoRSS format
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/namespaces/itunes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Support for the iTunes format
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/namespaces/mediarss.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Support for the Media RSS format
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/namespaces/psc.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Support for the Podlove Simple Chapters format
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/parsers/loose.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# The loose feed parser that interfaces with an SGML parsing library
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/parsers/strict.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# The strict feed parser that interfaces with an XML parsing library
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/sanitizer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/sgml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/urls.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
2 changes: 1 addition & 1 deletion ext/feedparser/util.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2010-2022 Kurt McKee <[email protected]>
# Copyright 2010-2023 Kurt McKee <[email protected]>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
Expand Down
57 changes: 39 additions & 18 deletions ext/markdown2.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
# not yet sure if there are implications with this. Compare 'pydoc sre'
# and 'perldoc perlre'.

__version_info__ = (2, 4, 11)
__version_info__ = (2, 4, 13)
__version__ = '.'.join(map(str, __version_info__))
__author__ = "Trent Mick"

Expand Down Expand Up @@ -783,8 +783,15 @@ def _detab(self, text):
def _hash_html_block_sub(self, match, raw=False):
if isinstance(match, str):
html = match
tag = None
else:
html = match.group(1)
try:
tag = match.group(2)
except IndexError:
tag = None

tag = tag or re.match(r'.*?<(\S).*?>', html).group(1)

if raw and self.safe_mode:
html = self._sanitize_html(html)
Expand All @@ -793,9 +800,17 @@ def _hash_html_block_sub(self, match, raw=False):
m = self._html_markdown_attr_re.search(first_line)
if m:
lines = html.split('\n')
# if MD is on same line as opening tag then split across two lines
lines = list(filter(None, (re.split(r'(.*?<%s.*markdown=.*?>)' % tag, lines[0])))) + lines[1:]
# if MD on same line as closing tag, split across two lines
lines = lines[:-1] + list(filter(None, re.split(r'(\s*?</%s>.*?$)' % tag, lines[-1])))
# extract key sections of the match
first_line = lines[0]
middle = '\n'.join(lines[1:-1])
last_line = lines[-1]
# remove `markdown="1"` attr from tag
first_line = first_line[:m.start()] + first_line[m.end():]
# hash the HTML segments to protect them
f_key = _hash_text(first_line)
self.html_blocks[f_key] = first_line
l_key = _hash_text(last_line)
Expand Down Expand Up @@ -1238,24 +1253,24 @@ def _do_tables(self, text):
"""
less_than_tab = self.tab_width - 1
table_re = re.compile(r'''
(?:(?<=\n\n)|\A\n?) # leading blank line
(?:(?<=\n)|\A\n?) # leading blank line
^[ ]{0,%d} # allowed whitespace
(.*[|].*) \n # $1: header row (at least one pipe)
(.*[|].*)[ ]*\n # $1: header row (at least one pipe)
^[ ]{0,%d} # allowed whitespace
( # $2: underline row
# underline row with leading bar
(?: \|\ *:?-+:?\ * )+ \|? \s? \n
(?: \|\ *:?-+:?\ * )+ \|? \s?[ ]*\n
|
# or, underline row without leading bar
(?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \s? \n
(?: \ *:?-+:?\ *\| )+ (?: \ *:?-+:?\ * )? \s?[ ]*\n
)
( # $3: data rows
(?:
^[ ]{0,%d}(?!\ ) # ensure line begins with 0 to less_than_tab spaces
.*\|.* \n
.*\|.*[ ]*\n
)+
)
''' % (less_than_tab, less_than_tab, less_than_tab), re.M | re.X)
Expand Down Expand Up @@ -1351,15 +1366,20 @@ def _run_span_gamut(self, text):
text = self._do_smart_punctuation(text)

# Do hard breaks:
if 'breaks' in self.extras:
break_tag = "<br%s\n" % self.empty_element_suffix
# do backslashes first because on_newline inserts the break before the newline
if self.extras['breaks'].get('on_backslash', False):
text = re.sub(r' *\\\n', break_tag, text)
if self.extras['breaks'].get('on_newline', False):
text = re.sub(r" *\n(?!\<(?:\/?(ul|ol|li))\>)", break_tag, text)
on_backslash = self.extras.get('breaks', {}).get('on_backslash', False)
on_newline = self.extras.get('breaks', {}).get('on_newline', False)

if on_backslash and on_newline:
pattern = r' *\\?'
elif on_backslash:
pattern = r'(?: *\\| {2,})'
elif on_newline:
pattern = r' *'
else:
text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
pattern = r' {2,}'

break_tag = "<br%s\n" % self.empty_element_suffix
text = re.sub(pattern + r"\n(?!\<(?:\/?(ul|ol|li))\>)", break_tag, text)

return text

Expand Down Expand Up @@ -1775,7 +1795,8 @@ def _do_links(self, text):
curr_pos = start_idx + 1
else:
# This id isn't defined, leave the markup alone.
curr_pos = match.end()
# set current pos to end of link title and continue from there
curr_pos = p
continue

# Otherwise, it isn't markup.
Expand Down Expand Up @@ -2395,9 +2416,9 @@ def _do_tg_spoiler(self, text):
text = self._tg_spoiler_re.sub(r"<tg-spoiler>\1</tg-spoiler>", text)
return text

_strong_re = re.compile(r"(\*\*|__)(?=\S)(.*\S)\1", re.S)
_strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]?)(?<=\S)\1", re.S)
_em_re = r"(\*|_)(?=\S)(.*?\S)\1"
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.*\S)\*\*", re.S)
_code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]?)(?<=\S)\*\*", re.S)
_code_friendly_em_re = r"\*(?=\S)(.+?)\*"
def _do_italics_and_bold(self, text):
if self.extras.get('middle-word-em', True) is False:
Expand Down Expand Up @@ -2623,7 +2644,7 @@ def _encode_amps_and_angles(self, text):
text = self._naked_gt_re.sub('&gt;', text)
return text

_incomplete_tags_re = re.compile(r"<(/?\w+?(?!\w)\s*?.+?[\s/]+?)")
_incomplete_tags_re = re.compile(r"<(!--|/?\w+?(?!\w)\s*?.+?[\s/]+?)")

def _encode_incomplete_tags(self, text):
if self.safe_mode not in ("replace", "escape"):
Expand Down
Loading

0 comments on commit 50a9e00

Please sign in to comment.