From 97e384917185ea05a5db24cebf77dffaad0d5fae Mon Sep 17 00:00:00 2001
From: Sumeet Jain
Date: Sun, 24 Jan 2016 21:46:12 -0800
Subject: [PATCH 1/6] checkpoint 1. Trying to convert what should be bytes vs
str
---
talon/html_quotations.py | 6 ++--
talon/quotations.py | 36 +++++++++++------------
talon/signature/__init__.py | 1 +
talon/signature/data/classifier | Bin 608 -> 600 bytes
talon/signature/data/classifier_01.npy | Bin 96 -> 88 bytes
talon/signature/data/classifier_02.npy | Bin 176 -> 96 bytes
talon/signature/data/classifier_03.npy | Bin 88 -> 176 bytes
talon/signature/learning/dataset.py | 6 ++--
talon/utils.py | 18 ++++++++++--
tests/html_quotations_test.py | 2 +-
tests/signature/bruteforce_test.py | 4 +--
tests/signature/extraction_test.py | 2 +-
tests/signature/learning/dataset_test.py | 2 +-
tests/signature/learning/helpers_test.py | 2 +-
tests/text_quotations_test.py | 29 +++++++++---------
tests/utils_test.py | 16 +++++-----
16 files changed, 70 insertions(+), 54 deletions(-)
diff --git a/talon/html_quotations.py b/talon/html_quotations.py
index 44afb6b2..2e3b1cac 100644
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -8,9 +8,9 @@
from talon.utils import cssselect
-CHECKPOINT_PREFIX = '#!%!'
-CHECKPOINT_SUFFIX = '!%!#'
-CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
+CHECKPOINT_PREFIX = b'#!%!'
+CHECKPOINT_SUFFIX = b'!%!#'
+CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + b'\d+' + CHECKPOINT_SUFFIX)
# HTML quote indicators (tag ids)
QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
diff --git a/talon/quotations.py b/talon/quotations.py
index 60163109..9b111065 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -25,9 +25,9 @@
RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
RE_ON_DATE_SMB_WROTE = re.compile(
- u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+ '(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
# Beginning of the line
- u'|'.join((
+ '|'.join((
# English
'On',
# French
@@ -39,25 +39,25 @@
# German
'Am',
# Norwegian
- u'På',
+ 'På',
# Swedish, Danish
'Den',
)),
# Date and sender separator
- u'|'.join((
+ '|'.join((
# most languages separate date and sender address by comma
',',
# polish date and sender address separator
- u'użytkownik'
+ 'użytkownik'
)),
# Ending of the line
- u'|'.join((
+ '|'.join((
# English
'wrote', 'sent',
# French
- u'a écrit',
+ 'a écrit',
# Polish
- u'napisał',
+ 'napisał',
# Dutch
'schreef','verzond','geschreven',
# German
@@ -68,15 +68,15 @@
))
# Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
RE_ON_DATE_WROTE_SMB = re.compile(
- u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+ '(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
# Beginning of the line
- u'|'.join((
+ '|'.join((
'Op',
#German
'Am'
)),
# Ending of the line
- u'|'.join((
+ '|'.join((
# Dutch
'schreef','verzond','geschreven',
# German
@@ -121,22 +121,22 @@
# ------Original Message------ or ---- Reply Message ----
# With variations in other languages.
-RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
- u'|'.join((
+RE_ORIGINAL_MESSAGE = re.compile('[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
+ '|'.join((
# English
'Original Message', 'Reply Message',
# German
- u'Ursprüngliche Nachricht', 'Antwort Nachricht',
+ 'Ursprüngliche Nachricht', 'Antwort Nachricht',
# Danish
'Oprindelig meddelelse',
))), re.I)
-RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
- u'|'.join((
+RE_FROM_COLON_OR_DATE_COLON = re.compile('(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
+ '|'.join((
# "From" in different languages.
- 'From', 'Van', 'De', 'Von', 'Fra', u'Från',
+ 'From', 'Van', 'De', 'Von', 'Fra', 'Från',
# "Date" in different languages.
- 'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
+ 'Date', 'Datum', 'Envoyé', 'Skickat', 'Sendt',
))), re.I)
# ---- John Smith wrote ----
diff --git a/talon/signature/__init__.py b/talon/signature/__init__.py
index fc60e1d8..6398f522 100644
--- a/talon/signature/__init__.py
+++ b/talon/signature/__init__.py
@@ -35,5 +35,6 @@
def initialize():
+ print(EXTRACTOR_FILENAME)
extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
EXTRACTOR_DATA)
diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier
index 1c3a4b0865f3e951b1b3b17fb31bacc48d8d005b..4ee71eec21ee1e6549a036ebe124d3c2b290df13 100644
GIT binary patch
literal 600
zcmaix!EV$r5QejxU3P(jw5lpm#IZ=_Km-MGszj@;RH>mk6bX)&lgu^-#|gHxG>1w(
zl^*8QPM?Oi;e^CPfMa$GobbW2M!)C({+YAE^5875(j}eJCXQ-QT-uhfFSiH>dBY`LU}!5z`>!9q`u5||^eGNxWn7DU=2OEp
zU!}G
z^oDAf+qQ4vrF1&@8U_KKI}NUQa`qX-abY`}
z#pW0lljXXKwcwWyoPx!R&wP7fbbBpM))@8dh`d_B#`n-HeJZnl$Fs|?W@6*fGcj9bu|)KuW9Ll{iCAu9|H486YF~QuRXDUsmOTi@Tw2?E!qod4?F{be6E?a@iCYftPiNBv)aWYq_IITJZ$fyy8DEYZwwj*7gokM(7p$Jz4J
hzM*pUm-S8i?F*eUl@_>f-w&g^9+t75GJW~o9svK-Ir9Jj
delta 50
wcmdnM7%@Rd$1O3ZI8{eMy*MMWAXVK;LBmi-Q%9jz0SRzb&N{NGT>9010AykgrvLx|
diff --git a/talon/signature/learning/dataset.py b/talon/signature/learning/dataset.py
index 308995be..63c0489a 100644
--- a/talon/signature/learning/dataset.py
+++ b/talon/signature/learning/dataset.py
@@ -61,7 +61,7 @@ def parse_msg_sender(filename, sender_known=True):
if os.path.isfile(filename) and not is_sender_filename(filename):
with open(filename) as f:
msg = f.read()
- sender = u''
+ sender = ''
if sender_known:
sender_filename = build_sender_filename(filename)
if os.path.exists(sender_filename):
@@ -124,9 +124,9 @@ def build_detection_dataset(folder, dataset_filename,
"""
if os.path.exists(dataset_filename):
os.remove(dataset_filename)
- build_detection_class(os.path.join(folder, u'P'),
+ build_detection_class(os.path.join(folder, 'P'),
dataset_filename, 1)
- build_detection_class(os.path.join(folder, u'N'),
+ build_detection_class(os.path.join(folder, 'N'),
dataset_filename, -1)
diff --git a/talon/utils.py b/talon/utils.py
index e6c884bf..4ea5ab56 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -37,7 +37,7 @@ def safe_format(format_string, *args, **kwargs):
# ignore other errors
except:
- return u''
+ return ''
def to_unicode(str_or_unicode, precise=False):
@@ -115,7 +115,21 @@ def get_delimiter(msg_body):
return delimiter
-def html_tree_to_text(tree):
+def html_to_text(string):
+ """
+ Dead-simple HTML-to-text converter:
+ >>> html_to_text("one
two
three")
+ >>> "one\ntwo\nthree"
+
+ NOTES:
+ 1. the string is expected to contain UTF-8 encoded HTML!
+ 2. returns utf-8 encoded str (not unicode)
+ """
+ s = _prepend_utf8_declaration(string)
+ s = s.replace(b"\n", b"")
+
+ tree = html.fromstring(s)
+
for style in CSSSelector('style')(tree):
style.getparent().remove(style)
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index b78409bf..3aca19be 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -157,7 +157,7 @@ def test_gmail_quote_blockquote():
def test_unicode_in_reply():
- msg_body = u"""Reply \xa0 \xa0 Text
+ msg_body = """Reply \xa0 \xa0 Text
diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py
index 382615bb..9867901a 100644
--- a/tests/signature/bruteforce_test.py
+++ b/tests/signature/bruteforce_test.py
@@ -128,10 +128,10 @@ def test_blackberry_signature():
eq_(('Heeyyoooo.', msg_body[len('Heeyyoooo.\n'):]),
bruteforce.extract_signature(msg_body))
- msg_body = u"""Blah
+ msg_body = """Blah
Enviado desde mi oficina móvil BlackBerry® de Telcel"""
- eq_(('Blah', u'Enviado desde mi oficina móvil BlackBerry® de Telcel'),
+ eq_(('Blah', 'Enviado desde mi oficina móvil BlackBerry® de Telcel'),
bruteforce.extract_signature(msg_body))
diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py
index b5703031..0cbcb46e 100644
--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -127,7 +127,7 @@ def test_handles_unicode():
@patch.object(signature.extraction, 'has_signature')
def test_signature_extract_crash(has_signature):
has_signature.side_effect = Exception('Bam!')
- msg_body = u'Blah\r\n--\r\n\r\nСергей'
+ msg_body = 'Blah\r\n--\r\n\r\nСергей'
eq_((msg_body, None), signature.extract(msg_body, 'Сергей'))
diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py
index 8e152753..5f84bac3 100644
--- a/tests/signature/learning/dataset_test.py
+++ b/tests/signature/learning/dataset_test.py
@@ -32,7 +32,7 @@ def test_parse_msg_sender():
# if the message sender is stored in a separate file
sender, msg = d.parse_msg_sender(MSG_FILENAME_WITH_BODY_SUFFIX)
with open(MSG_FILENAME_WITH_BODY_SUFFIX) as f:
- eq_(sender, u"john@example.com")
+ eq_(sender, "john@example.com")
eq_(msg, f.read())
diff --git a/tests/signature/learning/helpers_test.py b/tests/signature/learning/helpers_test.py
index d9e7b866..2870c822 100644
--- a/tests/signature/learning/helpers_test.py
+++ b/tests/signature/learning/helpers_test.py
@@ -151,7 +151,7 @@ def test_extract_names():
['David', 'DECOSTER', 'Domicile']
}
- for sender, expected_names in senders_names.items():
+ for sender, expected_names in list(senders_names.items()):
extracted_names = h.extract_names(sender)
# check that extracted names could be compiled
try:
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 7a81c994..2e274fba 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -54,7 +54,7 @@ def test_pattern_on_date_wrote_somebody():
"""Lorem
Op 13-02-2014 3:18 schreef Julius Caesar
:
-
+
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
"""))
@@ -137,7 +137,7 @@ def test_reply_and_quotation_splitter_share_line():
def _check_pattern_original_message(original_message_indicator):
- msg_body = u"""Test reply
+ msg_body = """Test reply
-----{}-----
@@ -145,12 +145,13 @@ def _check_pattern_original_message(original_message_indicator):
eq_('Test reply', quotations.extract_from_plain(
msg_body.format(six.text_type(original_message_indicator))))
+
def test_english_original_message():
_check_pattern_original_message('Original Message')
_check_pattern_original_message('Reply Message')
def test_german_original_message():
- _check_pattern_original_message(u'Ursprüngliche Nachricht')
+ _check_pattern_original_message('Ursprüngliche Nachricht')
_check_pattern_original_message('Antwort Nachricht')
def test_danish_original_message():
@@ -256,7 +257,7 @@ def test_with_indent():
------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
-Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
+Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
"""
eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
@@ -267,8 +268,8 @@ def test_short_quotation_with_newline():
On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" wrote:
Hi Mark,
-Blah blah?
-Thanks,Christine
+Blah blah?
+Thanks,Christine
On Jan 27, 2015, at 11:55 AM, Mark XXX wrote:
@@ -312,7 +313,7 @@ def test_german_from_block():
def test_french_multiline_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
- u"""Lorem ipsum
+ """Lorem ipsum
De : Brendan xxx [mailto:brendan.xxx@xxx.com]
Envoyé : vendredi 23 janvier 2015 16:39
@@ -324,7 +325,7 @@ def test_french_multiline_from_block():
def test_french_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
- u"""Lorem ipsum
+ """Lorem ipsum
Le 23 janv. 2015 à 22:03, Brendan xxx > a écrit:
@@ -332,7 +333,7 @@ def test_french_from_block():
def test_polish_from_block():
eq_('Lorem ipsum', quotations.extract_from_plain(
- u"""Lorem ipsum
+ """Lorem ipsum
W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx
napisał:
@@ -354,7 +355,7 @@ def test_danish_from_block():
def test_swedish_from_block():
eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
- u"""Allo! Follow up MIME!
+ """Allo! Follow up MIME!
Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
Skickat: den 26 augusti 2015 14:45
Till: Isacson Leiff
@@ -373,7 +374,7 @@ def test_swedish_from_line():
def test_norwegian_from_line():
eq_('Lorem', quotations.extract_from_plain(
- u"""Lorem
+ """Lorem
På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
@@ -381,11 +382,11 @@ def test_norwegian_from_line():
def test_dutch_from_block():
eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
- """Gluten-free culpa lo-fi et nesciunt nostrud.
+ """Gluten-free culpa lo-fi et nesciunt nostrud.
Op 17-feb.-2015, om 13:18 heeft Julius Caesar het volgende geschreven:
-
-Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
+
+Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
"""))
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 778e858c..1838db6c 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -16,15 +16,15 @@ def test_get_delimiter():
def test_unicode():
- eq_ (u'hi', u.to_unicode('hi'))
- eq_ (type(u.to_unicode('hi')), six.text_type )
- eq_ (type(u.to_unicode(u'hi')), six.text_type )
- eq_ (type(u.to_unicode('привет')), six.text_type )
- eq_ (type(u.to_unicode(u'привет')), six.text_type )
- eq_ (u"привет", u.to_unicode('привет'))
- eq_ (u"привет", u.to_unicode(u'привет'))
+ eq_ ('hi', u.to_unicode('hi'))
+ eq_ (type(u.to_unicode('hi')), str )
+ eq_ (type(u.to_unicode('hi')), str )
+ eq_ (type(u.to_unicode('привет')), str )
+ eq_ (type(u.to_unicode('привет')), str )
+ eq_ ("привет", u.to_unicode('привет'))
+ eq_ ("привет", u.to_unicode('привет'))
# some latin1 stuff
- eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
+ eq_ ("Versión", u.to_unicode('Versi\xf3n', precise=True))
def test_detect_encoding():
From f02a866f2a07b7f233c83daea46559a1453ce3c5 Mon Sep 17 00:00:00 2001
From: Sumeet Jain
Date: Sun, 24 Jan 2016 22:18:44 -0800
Subject: [PATCH 2/6] further progress. text_quotations almost work
---
talon/quotations.py | 24 ++++++++++++------------
tests/text_quotations_test.py | 9 +++++----
2 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/talon/quotations.py b/talon/quotations.py
index 9b111065..d64f4c87 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -86,7 +86,7 @@
)
RE_QUOTATION = re.compile(
- r'''
+ rb'''
(
# quotation border: splitter line or a number of quotation marker lines
(?:
@@ -107,7 +107,7 @@
''', re.VERBOSE)
RE_EMPTY_QUOTATION = re.compile(
- r'''
+ rb'''
(
# quotation border: splitter line or a number of quotation marker lines
(?:
@@ -225,11 +225,11 @@ def mark_message_lines(lines):
i = 0
while i < len(lines):
if not lines[i].strip():
- markers[i] = 'e' # empty line
+ markers[i] = ord('e') # empty line
elif QUOT_PATTERN.match(lines[i]):
- markers[i] = 'm' # line with quotation marker
+ markers[i] = ord('m') # line with quotation marker
elif RE_FWD.match(lines[i]):
- markers[i] = 'f' # ---- Forwarded message ----
+ markers[i] = ord('f') # ---- Forwarded message ----
else:
# in case splitter is spread across several lines
splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
@@ -238,13 +238,13 @@ def mark_message_lines(lines):
# append as many splitter markers as lines in splitter
splitter_lines = splitter.group().splitlines()
for j in range(len(splitter_lines)):
- markers[i + j] = 's'
+ markers[i + j] = ord('s')
# skip splitter lines
i += len(splitter_lines) - 1
else:
# probably the line from the last message in the conversation
- markers[i] = 't'
+ markers[i] = ord('t')
i += 1
return ''.join(markers)
@@ -263,17 +263,17 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
"""
markers = ''.join(markers)
# if there are no splitter there should be no markers
- if 's' not in markers and not re.search('(me*){3}', markers):
- markers = markers.replace('m', 't')
+ if b's' not in markers and not re.search(b'(me*){3}', markers):
+ markers = markers.replace(b'm', b't')
- if re.match('[te]*f', markers):
+ if re.match(b'[te]*f', markers):
return_flags[:] = [False, -1, -1]
return lines
# inlined reply
# use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
# both 't' entries should be found
- for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
+ for inline_reply in re.finditer(b'(?<=m)e*((?:t+e*)+)m', markers):
# long links could break sequence of quotation lines but they shouldn't
# be considered an inline reply
links = (
@@ -284,7 +284,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
return lines
# cut out text lines coming after splitter if there are no markers there
- quotation = re.search('(se*)+((t|f)+e*)+', markers)
+ quotation = re.search(b'(se*)+((t|f)+e*)+', markers)
if quotation:
return_flags[:] = [True, quotation.start(), len(lines)]
return lines[:quotation.start()]
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 2e274fba..a6e33574 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -516,7 +516,7 @@ def test_mark_message_lines():
'> Hi',
'',
'Signature']
- eq_('tessemet', quotations.mark_message_lines(lines))
+ eq_(b'tessemet', quotations.mark_message_lines(lines))
lines = ['Just testing the email reply',
'',
@@ -530,7 +530,7 @@ def test_mark_message_lines():
'wrote:',
'',
'Tarmo Lehtpuu has posted the following message on']
- eq_('tettessset', quotations.mark_message_lines(lines))
+ eq_(b'tettessset', quotations.mark_message_lines(lines))
def test_process_marked_lines():
@@ -696,8 +696,9 @@ def test_standard_replies():
with open(filename) as f:
message = email.message_from_file(f)
body = next(email.iterators.typed_subpart_iterator(message, subtype='plain'))
- text = ''.join(body_iterator(body, True))
-
+ text = ''.join(email.iterators.body_line_iterator(body, True))
+ if not text:
+ text = ''.join(email.iterators.body_line_iterator(body, False))
stripped_text = quotations.extract_from_plain(text)
reply_text_fn = filename[:-4] + '_reply_text'
if os.path.isfile(reply_text_fn):
From 34eb4ff7f493003bacd5910e4cf4b0044e1bb12b Mon Sep 17 00:00:00 2001
From: Sumeet Jain
Date: Sat, 30 Jan 2016 14:33:24 -0800
Subject: [PATCH 3/6] passing utils
---
talon/utils.py | 8 ++++----
tests/utils_test.py | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/talon/utils.py b/talon/utils.py
index 4ea5ab56..a7d81e4f 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -126,7 +126,7 @@ def html_to_text(string):
2. returns utf-8 encoded str (not unicode)
"""
s = _prepend_utf8_declaration(string)
- s = s.replace(b"\n", b"")
+ s = s.replace("\n", "")
tree = html.fromstring(s)
@@ -223,7 +223,7 @@ def html_too_big(s):
def _contains_charset_spec(s):
"""Return True if the first 4KB contain charset spec
"""
- return s.lower().find(b'html; charset=', 0, 4096) != -1
+ return s.lower().find('html; charset=', 0, 4096) != -1
def _prepend_utf8_declaration(s):
@@ -259,8 +259,8 @@ def _html5lib_parser():
)
-_UTF8_DECLARATION = (b'')
+_UTF8_DECLARATION = ('')
_BLOCKTAGS = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 1838db6c..1003f7b5 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -79,7 +79,7 @@ def test_html_to_text():
Hi