diff --git a/talon/quotations.py b/talon/quotations.py
index 60163109..f540a729 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -25,9 +25,9 @@
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
 
 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    '(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
         # Beginning of the line
-        u'|'.join((
+        '|'.join((
             # English
             'On',
             # French
@@ -39,25 +39,25 @@
             # German
             'Am',
             # Norwegian
-            u'På',
+            'På',
             # Swedish, Danish
             'Den',
         )),
         # Date and sender separator
-        u'|'.join((
+        '|'.join((
             # most languages separate date and sender address by comma
             ',',
             # polish date and sender address separator
-            u'użytkownik'
+            'użytkownik'
         )),
         # Ending of the line
-        u'|'.join((
+        '|'.join((
             # English
             'wrote', 'sent',
             # French
-            u'a écrit',
+            'a écrit',
             # Polish
-            u'napisał',
+            'napisał',
             # Dutch
             'schreef','verzond','geschreven',
             # German
@@ -68,15 +68,15 @@
     ))
 
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+    '(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
         # Beginning of the line
-        u'|'.join((
+        '|'.join((
             'Op',
             #German
             'Am'
         )),
         # Ending of the line
-        u'|'.join((
+        '|'.join((
             # Dutch
             'schreef','verzond','geschreven',
             # German
@@ -86,7 +86,7 @@
 )
 
 RE_QUOTATION = re.compile(
-    r'''
+    rb'''
     (
         # quotation border: splitter line or a number of quotation marker lines
         (?:
@@ -107,7 +107,7 @@
     ''', re.VERBOSE)
 
 RE_EMPTY_QUOTATION = re.compile(
-    r'''
+    rb'''
     (
         # quotation border: splitter line or a number of quotation marker lines
         (?:
@@ -121,26 +121,26 @@
 
 # ------Original Message------ or ---- Reply Message ----
 # With variations in other languages.
-RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
-    u'|'.join((
+RE_ORIGINAL_MESSAGE = re.compile('[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
+    '|'.join((
         # English
         'Original Message', 'Reply Message',
         # German
-        u'Ursprüngliche Nachricht', 'Antwort Nachricht',
+        'Ursprüngliche Nachricht', 'Antwort Nachricht',
         # Danish
         'Oprindelig meddelelse',
     ))), re.I)
 
-RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
-    u'|'.join((
+RE_FROM_COLON_OR_DATE_COLON = re.compile('(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
+    '|'.join((
         # "From" in different languages.
-        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
+        'From', 'Van', 'De', 'Von', 'Fra', 'Från',
         # "Date" in different languages.
-        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
+        'Date', 'Datum', 'Envoyé', 'Skickat', 'Sendt',
     ))), re.I)
 
 # ---- John Smith wrote ----
-RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
+RE_ANDROID_WROTE = re.compile('[\s]*[-]+.*({})[ ]*[-]+'.format(
     u'|'.join((
         # English
         'wrote'
@@ -183,6 +183,7 @@
 RE_HEADER = re.compile(": ")
 
+
 def extract_from(msg_body, content_type='text/plain'):
     try:
         if content_type == 'text/plain':
@@ -221,15 +222,15 @@ def mark_message_lines(lines):
     >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
     'tsem'
     """
-    markers = ['e' for _ in lines]
+    markers = [b'e' for _ in lines]
     i = 0
     while i < len(lines):
         if not lines[i].strip():
-            markers[i] = 'e'  # empty line
+            markers[i] = b'e'  # empty line
         elif QUOT_PATTERN.match(lines[i]):
-            markers[i] = 'm'  # line with quotation marker
+            markers[i] = b'm'  # line with quotation marker
         elif RE_FWD.match(lines[i]):
-            markers[i] = 'f'  # ---- Forwarded message ----
+            markers[i] = b'f'  # ---- Forwarded message ----
         else:
             # in case splitter is spread across several lines
             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
@@ -238,16 +239,16 @@ def mark_message_lines(lines):
                 # append as many splitter markers as lines in splitter
                 splitter_lines = splitter.group().splitlines()
                 for j in range(len(splitter_lines)):
-                    markers[i + j] = 's'
+                    markers[i + j] = b's'
 
                 # skip splitter lines
                 i += len(splitter_lines) - 1
             else:
                 # probably the line from the last message in the conversation
-                markers[i] = 't'
+                markers[i] = b't'
 
         i += 1
 
-    return ''.join(markers)
+    return b''.join(markers)
 
 
 def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
@@ -261,19 +262,18 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
     return_flags = [were_lines_deleted, first_deleted_line, last_deleted_line]
     """
-    markers = ''.join(markers)
     # if there are no splitter there should be no markers
-    if 's' not in markers and not re.search('(me*){3}', markers):
-        markers = markers.replace('m', 't')
+    if b's' not in markers and not re.search(b'(me*){3}', markers):
+        markers = markers.replace(b'm', b't')
 
-    if re.match('[te]*f', markers):
+    if re.match(b'[te]*f', markers):
         return_flags[:] = [False, -1, -1]
         return lines
 
     # inlined reply
     # use lookbehind assertions to find overlapping entries e.g.
     # for 'mtmtm' both 't' entries should be found
-    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
+    for inline_reply in re.finditer(b'(?<=m)e*((?:t+e*)+)m', markers):
         # long links could break sequence of quotation lines but they shouldn't
         # be considered an inline reply
         links = (
@@ -284,7 +284,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
         return lines
 
     # cut out text lines coming after splitter if there are no markers there
-    quotation = re.search('(se*)+((t|f)+e*)+', markers)
+    quotation = re.search(b'(se*)+((t|f)+e*)+', markers)
     if quotation:
         return_flags[:] = [True, quotation.start(), len(lines)]
         return lines[:quotation.start()]
@@ -411,7 +411,6 @@ def extract_from_html(msg_body):
 
     return result
 
-
 def _extract_from_html(msg_body):
     """
     Extract not quoted message from provided html message body
@@ -489,7 +488,7 @@ def _extract_from_html(msg_body):
     if _readable_text_empty(html_tree_copy):
         return msg_body
 
-    return html.tostring(html_tree_copy)
+    return _html_tostring(html_tree_copy)
 
 
 def split_emails(msg):
@@ -525,43 +524,46 @@ def _mark_quoted_email_splitlines(markers, lines):
     """
     # Create a list of markers to easily alter specific characters
     markerlist = list(markers)
+
     for i, line in enumerate(lines):
-        if markerlist[i] != 'm':
+        if markerlist[i] != b'm'[0]:
             continue
         for pattern in SPLITTER_PATTERNS:
             matcher = re.search(pattern, line)
             if matcher:
-                markerlist[i] = 's'
+                markerlist[i] = b's'[0]
                 break
 
-    return "".join(markerlist)
+    return bytes(markerlist)
 
 
 def _correct_splitlines_in_headers(markers, lines):
     """
     Corrects markers by removing splitlines deemed to be inside header blocks.
     """
-    updated_markers = ""
+    updated_markers = b""
     i = 0
     in_header_block = False
 
     for m in markers:
         # Only set in_header_block flag when we hit an 's' and line is a header
-        if m == 's':
+        m = bytes([m])
+        if m == b"s":
             if not in_header_block:
                 if bool(re.search(RE_HEADER, lines[i])):
                     in_header_block = True
         else:
             if QUOT_PATTERN.match(lines[i]):
-                m = 'm'
+                m = b"m"
             else:
-                m = 't'
 
+                m = b"t"
 
             # If the line is not a header line, set in_header_block false.
             if not bool(re.search(RE_HEADER, lines[i])):
                 in_header_block = False
 
         # Add the marker to the new updated markers string.
+        print(updated_markers, m)
         updated_markers += m
         i += 1
 
@@ -598,3 +600,6 @@ def register_xpath_extensions():
     ns.prefix = 'mg'
     ns['text_content'] = text_content
     ns['tail'] = tail
+
+def _html_tostring(html_tree):
+    return html.tostring(html_tree).decode('utf-8')
diff --git a/talon/signature/__init__.py b/talon/signature/__init__.py
index fc60e1d8..6398f522 100644
--- a/talon/signature/__init__.py
+++ b/talon/signature/__init__.py
@@ -35,5 +35,6 @@
 
 
 def initialize():
+    print(EXTRACTOR_FILENAME)
     extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
                                            EXTRACTOR_DATA)
diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py
index 7f666bd9..bfc72f2d 100644
--- a/talon/signature/bruteforce.py
+++ b/talon/signature/bruteforce.py
@@ -50,7 +50,7 @@
 # c - could be signature line
 # d - line starts with dashes (could be signature or list item)
 # l - long line
-RE_SIGNATURE_CANDIDATE = re.compile(r'''
+RE_SIGNATURE_CANDIDATE = re.compile(br'''
     (?P<candidate>c+d)[^d]
     |
     (?P<candidate>c+d)$
@@ -163,16 +163,16 @@ def _mark_candidate_indexes(lines, candidate):
     'cdc'
     """
     # at first consider everything to be potential signature lines
-    markers = bytearray('c'*len(candidate))
+    markers = bytearray('c'*len(candidate), 'utf-8')
 
     # mark lines starting from bottom up
     for i, line_idx in reversed(list(enumerate(candidate))):
         if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE:
-            markers[i] = 'l'
+            markers[i] = ord(b'l')
         else:
             line = lines[line_idx].strip()
             if line.startswith('-') and line.strip("-"):
-                markers[i] = 'd'
+                markers[i] = ord(b'd')
 
     return markers
diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier
index 1c3a4b08..4ee71eec 100644
Binary files a/talon/signature/data/classifier and b/talon/signature/data/classifier differ
diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy
index 11d13026..ea117aac 100644
Binary files a/talon/signature/data/classifier_01.npy and b/talon/signature/data/classifier_01.npy differ
diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy
index 2cec7290..11d13026 100644
Binary files a/talon/signature/data/classifier_02.npy and b/talon/signature/data/classifier_02.npy differ
diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy
index e5762ae5..77af8e37 100644
Binary files a/talon/signature/data/classifier_03.npy and b/talon/signature/data/classifier_03.npy differ
diff --git a/talon/signature/extraction.py b/talon/signature/extraction.py
index 32591717..20263285 100644
--- a/talon/signature/extraction.py
+++ b/talon/signature/extraction.py
@@ -18,7 +18,7 @@
 
 # regex signature pattern for reversed lines
 # assumes that all long lines have been excluded
-RE_REVERSE_SIGNATURE = re.compile(r'''
+RE_REVERSE_SIGNATURE = re.compile(br'''
 # signature should consists of blocks like this
 (?:
    # it could end with empty line
@@ -81,7 +81,7 @@ def _mark_lines(lines, sender):
     candidate = get_signature_candidate(lines)
 
     # at first consider everything to be text no signature
-    markers = bytearray('t'*len(lines))
+    markers = bytearray('t'*len(lines), 'utf-8')
 
     # mark lines starting from bottom up
     # mark only lines that belong to candidate
@@ -92,9 +92,9 @@ def _mark_lines(lines, sender):
         # relative to lines not candidate
         j = len(lines) - len(candidate) + i
         if not line.strip():
-            markers[j] = 'e'
+            markers[j] = ord(b'e')
         elif is_signature_line(line, sender, EXTRACTOR):
-            markers[j] = 's'
+            markers[j] = ord(b's')
 
     return markers
diff --git a/talon/signature/learning/dataset.py b/talon/signature/learning/dataset.py
index 308995be..63c0489a 100644
--- a/talon/signature/learning/dataset.py
+++ b/talon/signature/learning/dataset.py
@@ -61,7 +61,7 @@ def parse_msg_sender(filename, sender_known=True):
     if os.path.isfile(filename) and not is_sender_filename(filename):
         with open(filename) as f:
             msg = f.read()
-            sender = u''
+            sender = ''
             if sender_known:
                 sender_filename = build_sender_filename(filename)
                 if os.path.exists(sender_filename):
@@ -124,9 +124,9 @@ def build_detection_dataset(folder, dataset_filename,
     """
     if os.path.exists(dataset_filename):
         os.remove(dataset_filename)
-    build_detection_class(os.path.join(folder, u'P'),
+    build_detection_class(os.path.join(folder, 'P'),
                           dataset_filename, 1)
-    build_detection_class(os.path.join(folder, u'N'),
+    build_detection_class(os.path.join(folder, 'N'),
                           dataset_filename, -1)
diff --git a/talon/utils.py b/talon/utils.py
index e6c884bf..a4dfb3bd 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -7,6 +7,7 @@
 import cchardet
 import regex as re
 
+import lxml.html
 from lxml.html import html5parser
 from lxml.cssselect import CSSSelector
 
@@ -37,7 +38,7 @@ def safe_format(format_string, *args, **kwargs):
 
     # ignore other errors
     except:
-        return u''
+        return ''
 
 
 def to_unicode(str_or_unicode, precise=False):
@@ -177,11 +178,13 @@ def html_to_text(string):
 def html_fromstring(s):
     """Parse html tree from string. Return None if the string can't be parsed.
     """
+    if isinstance(s, bytes):
+        s = s.decode()
     try:
         if html_too_big(s):
             return None
 
-        return html5parser.fromstring(s, parser=_html5lib_parser())
+        return lxml.html.document_fromstring(s, ensure_head_body=True)  #html5parser.fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
 
@@ -189,11 +192,13 @@ def html_document_fromstring(s):
     """Parse html tree from string. Return None if the string can't be parsed.
     """
+    if isinstance(s, bytes):
+        s = s.decode()
     try:
         if html_too_big(s):
             return None
 
-        return html5parser.document_fromstring(s, parser=_html5lib_parser())
+        return lxml.html.document_fromstring(s, ensure_head_body=True)  #html5parser.document_fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index b78409bf..3f7c4389 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -27,7 +27,7 @@ def test_quotation_splitter_inside_blockquote():
 
 </blockquote>"""
 
-    eq_("<html>Reply</html>",
+    eq_("<html><head></head><body>Reply</body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote():
 </blockquote>
 """
-    eq_("<html>Reply</html>",
+    eq_("<html><head></head><body>Reply</body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -62,7 +62,7 @@ def test_regular_blockquote():
 </blockquote>
 """
-    eq_("<html>Reply<blockquote>Regular</blockquote></html>",
+    eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -129,7 +129,7 @@ def test_gmail_quote():
 
 </div>"""
 
-    eq_("<html>Reply</html>",
+    eq_("<html><head></head><body>Reply</body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -140,7 +140,7 @@ def test_gmail_quote_compact():
                '<div>Test</div>' \
                '</div>' \
                '</div>'
-    eq_("<html>Reply</html>",
+    eq_("<html><head></head><body>Reply</body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -157,7 +157,7 @@ def test_gmail_quote_blockquote():
 
 
 def test_unicode_in_reply():
-    msg_body = u"""Reply \xa0 \xa0 Text<br>
+    msg_body = """Reply \xa0 \xa0 Text<br>
 
 <div>
 
@@ -165,9 +165,9 @@ def test_unicode_in_reply():
 
 <blockquote>
 Quote
-</blockquote>""".encode("utf-8")
+</blockquote>"""
 
-    eq_("<html>Reply&#160;&#160;Text<br><div><br></div>"
-        "</html>",
+    eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
+        "</body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
@@ -298,7 +298,7 @@ def test_from_block_and_quotations_in_separate_divs():
   </div>
 </div>
 '''
-    eq_('<html>Reply<div><hr></div></html>',
+    eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -373,9 +373,9 @@ def test_CRLF():
 """
     msg_body = msg_body.replace('\n', '\r\n')
     extracted = quotations.extract_from_html(msg_body)
-    assert_false(symbol in extracted)
-    # Keep new lines otherwise "My reply" becomes one word - "Myreply"
-    eq_("My\nreply\n", extracted)
+    assert_false(symbol in extracted)
+    # Keep new lines otherwise "My reply" becomes one word - "Myreply"
+    eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
 
 
 def test_gmail_forwarded_msg():
diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py
index 382615bb..65b53242 100644
--- a/tests/signature/bruteforce_test.py
+++ b/tests/signature/bruteforce_test.py
@@ -128,10 +128,10 @@ def test_blackberry_signature():
     eq_(('Heeyyoooo.', msg_body[len('Heeyyoooo.\n'):]),
         bruteforce.extract_signature(msg_body))
 
-    msg_body = u"""Blah
+    msg_body = """Blah
 
 Enviado desde mi oficina móvil BlackBerry® de Telcel"""
 
-    eq_(('Blah', u'Enviado desde mi oficina móvil BlackBerry® de Telcel'),
+    eq_(('Blah', 'Enviado desde mi oficina móvil BlackBerry® de Telcel'),
         bruteforce.extract_signature(msg_body))
 
 
@@ -200,14 +200,14 @@ def test_get_signature_candidate():
 def test_mark_candidate_indexes():
     with patch.object(bruteforce, 'TOO_LONG_SIGNATURE_LINE', 3):
         # spaces are not considered when checking line length
-        eq_('clc',
+        eq_(b'clc',
             bruteforce._mark_candidate_indexes(
                 ['BR,  ', 'long', 'Bob'],
                 [0, 1, 2]))
 
         # only candidate lines are marked
         # if line has only dashes it's a candidate line
-        eq_('ccdc',
+        eq_(b'ccdc',
             bruteforce._mark_candidate_indexes(
                 ['-', 'long', '-', '- i', 'Bob'],
                 [0, 2, 3, 4]))
@@ -216,20 +216,20 @@ def test_process_marked_candidate_indexes():
     eq_([2, 13, 15],
         bruteforce._process_marked_candidate_indexes(
-            [2, 13, 15], 'dcc'))
+            [2, 13, 15], b'dcc'))
 
     eq_([15],
         bruteforce._process_marked_candidate_indexes(
-            [2, 13, 15], 'ddc'))
+            [2, 13, 15], b'ddc'))
 
     eq_([13, 15],
         bruteforce._process_marked_candidate_indexes(
-            [13, 15], 'cc'))
+            [13, 15], b'cc'))
 
     eq_([15],
         bruteforce._process_marked_candidate_indexes(
-            [15], 'lc'))
+            [15], b'lc'))
 
     eq_([15],
         bruteforce._process_marked_candidate_indexes(
-            [13, 15], 'ld'))
+            [13, 15], b'ld'))
diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py
index b5703031..1720100e 100644
--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -127,7 +127,7 @@ def test_handles_unicode():
 @patch.object(signature.extraction, 'has_signature')
 def test_signature_extract_crash(has_signature):
     has_signature.side_effect = Exception('Bam!')
-    msg_body = u'Blah\r\n--\r\n\r\nСергей'
+    msg_body = 'Blah\r\n--\r\n\r\nСергей'
     eq_((msg_body, None), signature.extract(msg_body, 'Сергей'))
 
 
@@ -135,7 +135,7 @@ def test_mark_lines():
     with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 2):
         # we analyse the 2nd line as well though it's the 6th line
         # (starting from the bottom) because we don't count empty line
-        eq_('ttset',
+        eq_(b'ttset',
             e._mark_lines(['Bob Smith',
                            'Bob Smith',
                            'Bob Smith',
@@ -145,7 +145,7 @@ def test_mark_lines():
     with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3):
         # we don't analyse the 1st line because
         # signature cant start from the 1st line
-        eq_('tset',
+        eq_(b'tset',
             e._mark_lines(['Bob Smith',
                            'Bob Smith',
                            '',
@@ -154,20 +154,20 @@ def test_mark_lines():
 
 def test_process_marked_lines():
     # no signature found
-    eq_((list(range(5)), None), e._process_marked_lines(list(range(5)), 'telt'))
+    eq_((list(range(5)), None), e._process_marked_lines(list(range(5)), b'telt'))
 
     # signature in the middle of the text
-    eq_((list(range(9)), None), e._process_marked_lines(list(range(9)), 'tesestelt'))
+    eq_((list(range(9)), None), e._process_marked_lines(list(range(9)), b'tesestelt'))
 
     # long line splits signature
     eq_((list(range(7)), [7, 8]),
-        e._process_marked_lines(list(range(9)), 'tsslsless'))
+        e._process_marked_lines(list(range(9)), b'tsslsless'))
 
     eq_((list(range(20)), [20]),
-        e._process_marked_lines(list(range(21)), 'ttttttstttesllelelets'))
+        e._process_marked_lines(list(range(21)), b'ttttttstttesllelelets'))
 
     # some signature lines could be identified as text
-    eq_(([0], list(range(1, 9))), e._process_marked_lines(list(range(9)), 'tsetetest'))
+    eq_(([0], list(range(1, 9))), e._process_marked_lines(list(range(9)), b'tsetetest'))
 
     eq_(([], list(range(5))),
-        e._process_marked_lines(list(range(5)), "ststt"))
+        e._process_marked_lines(list(range(5)), b"ststt"))
diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py
index 8e152753..5f84bac3 100644
--- a/tests/signature/learning/dataset_test.py
+++ b/tests/signature/learning/dataset_test.py
@@ -32,7 +32,7 @@ def test_parse_msg_sender():
     # if the message sender is stored in a separate file
     sender, msg = d.parse_msg_sender(MSG_FILENAME_WITH_BODY_SUFFIX)
     with open(MSG_FILENAME_WITH_BODY_SUFFIX) as f:
-        eq_(sender, u"john@example.com")
+        eq_(sender, "john@example.com")
         eq_(msg, f.read())
 
 
diff --git a/tests/signature/learning/helpers_test.py b/tests/signature/learning/helpers_test.py
index d9e7b866..2870c822 100644
--- a/tests/signature/learning/helpers_test.py
+++ b/tests/signature/learning/helpers_test.py
@@ -151,7 +151,7 @@ def test_extract_names():
         ['David', 'DECOSTER', 'Domicile']
     }
 
-    for sender, expected_names in senders_names.items():
+    for sender, expected_names in list(senders_names.items()):
         extracted_names = h.extract_names(sender)
         # check that extracted names could be compiled
         try:
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 7a81c994..d6722369 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -54,7 +54,7 @@ def test_pattern_on_date_wrote_somebody():
         """Lorem
 
 Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:
-
+ 
 Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
 """))
 
@@ -137,7 +137,7 @@ def test_reply_and_quotation_splitter_share_line():
 
 
 def _check_pattern_original_message(original_message_indicator):
-    msg_body = u"""Test reply
+    msg_body = """Test reply
 
 -----{}-----
 
@@ -145,12 +145,13 @@ def _check_pattern_original_message(original_message_indicator):
     eq_('Test reply', quotations.extract_from_plain(
         msg_body.format(six.text_type(original_message_indicator))))
 
+
 def test_english_original_message():
     _check_pattern_original_message('Original Message')
     _check_pattern_original_message('Reply Message')
 
 def test_german_original_message():
-    _check_pattern_original_message(u'Ursprüngliche Nachricht')
+    _check_pattern_original_message('Ursprüngliche Nachricht')
     _check_pattern_original_message('Antwort Nachricht')
 
 def test_danish_original_message():
@@ -256,7 +257,7 @@ def test_with_indent():
 
 ------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
 
-Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
+Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. 
 """
     eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.",
         quotations.extract_from_plain(msg_body))
@@ -267,8 +268,8 @@ def test_short_quotation_with_newline():
 On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <christine.XXX@XXX.com> wrote:
 
 Hi Mark,
-Blah blah?  
-Thanks,Christine  
+Blah blah? 
+Thanks,Christine
 
 On Jan 27, 2015, at 11:55 AM, Mark XXX wrote:
 
@@ -312,7 +313,7 @@ def test_german_from_block():
 
 def test_french_multiline_from_block():
     eq_('Lorem ipsum', quotations.extract_from_plain(
-        u"""Lorem ipsum
+        """Lorem ipsum
 
 De : Brendan xxx [mailto:brendan.xxx@xxx.com]
 Envoyé : vendredi 23 janvier 2015 16:39
@@ -324,7 +325,7 @@ def test_french_multiline_from_block():
 
 def test_french_from_block():
     eq_('Lorem ipsum', quotations.extract_from_plain(
-        u"""Lorem ipsum
+        """Lorem ipsum
 
 Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@xxx.com>> a écrit:
 
@@ -332,7 +333,7 @@ def test_french_from_block():
 
 def test_polish_from_block():
     eq_('Lorem ipsum', quotations.extract_from_plain(
-        u"""Lorem ipsum
+        """Lorem ipsum
 
 W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <zoe.xxx@xxx.com> napisał:
 
@@ -354,7 +355,7 @@ def test_danish_from_block():
 
 def test_swedish_from_block():
     eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
-        u"""Allo! Follow up MIME!
+        """Allo! Follow up MIME!
 
 Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
 Skickat: den 26 augusti 2015 14:45
 Till: Isacson Leiff
@@ -373,7 +374,7 @@ def test_swedish_from_line():
 
 def test_norwegian_from_line():
     eq_('Lorem', quotations.extract_from_plain(
-        u"""Lorem
+        """Lorem
 
 På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
 
 Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
@@ -381,11 +382,11 @@ def test_norwegian_from_line():
 
 def test_dutch_from_block():
     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
-        """Gluten-free culpa lo-fi et nesciunt nostrud.
+        """Gluten-free culpa lo-fi et nesciunt nostrud. 
 
 Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven:
-
-Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
+
+Small batch beard laboris tempor, non listicle hella Tumblr heirloom. 
 """))
""")) @@ -515,7 +516,7 @@ def test_mark_message_lines(): '> Hi', '', 'Signature'] - eq_('tessemet', quotations.mark_message_lines(lines)) + eq_(b'tessemet', quotations.mark_message_lines(lines)) lines = ['Just testing the email reply', '', @@ -529,41 +530,40 @@ def test_mark_message_lines(): 'wrote:', '', 'Tarmo Lehtpuu has posted the following message on'] - eq_('tettessset', quotations.mark_message_lines(lines)) + eq_(b'tettessset', quotations.mark_message_lines(lines)) def test_process_marked_lines(): # quotations and last message lines are mixed # consider all to be a last message - markers = 'tsemmtetm' - lines = [str(i) for i in range(len(markers))] + markers = b'tsemmtetm' lines = [str(i) for i in range(len(markers))] eq_(lines, quotations.process_marked_lines(lines, markers)) # no splitter => no markers - markers = 'tmm' + markers = b'tmm' lines = ['1', '2', '3'] eq_(['1', '2', '3'], quotations.process_marked_lines(lines, markers)) # text after splitter without markers is quotation - markers = 'tst' + markers = b'tst' lines = ['1', '2', '3'] eq_(['1'], quotations.process_marked_lines(lines, markers)) # message + quotation + signature - markers = 'tsmt' + markers = b'tsmt' lines = ['1', '2', '3', '4'] eq_(['1', '4'], quotations.process_marked_lines(lines, markers)) # message + + nested quotation - markers = 'tstsmt' + markers = b'tstsmt' lines = ['1', '2', '3', '4', '5', '6'] eq_(['1'], quotations.process_marked_lines(lines, markers)) # test links wrapped with paranthesis # link starts on the marker line - markers = 'tsmttem' + markers = b'tsmttem' lines = ['text', 'splitter', '>View (http://example.com', @@ -574,7 +574,7 @@ def test_process_marked_lines(): eq_(lines[:1], quotations.process_marked_lines(lines, markers)) # link starts on the new line - markers = 'tmmmtm' + markers = b'tmmmtm' lines = ['text', '>' '>', @@ -585,7 +585,7 @@ def test_process_marked_lines(): eq_(lines[:1], quotations.process_marked_lines(lines, markers)) # check all "inline" replies - markers = 'tsmtmtm' + markers = b'tsmtmtm' lines = ['text', 'splitter', '>', @@ -596,7 +596,7 @@ def test_process_marked_lines(): eq_(lines, quotations.process_marked_lines(lines, markers)) # inline reply with link not wrapped in paranthesis - markers = 'tsmtm' + markers = b'tsmtm' lines = ['text', 'splitter', '>', @@ -605,7 +605,7 @@ def test_process_marked_lines(): eq_(lines, quotations.process_marked_lines(lines, markers)) # inline reply with link wrapped in paranthesis - markers = 'tsmtm' + markers = b'tsmtm' lines = ['text', 'splitter', '>', @@ -695,8 +695,9 @@ def test_standard_replies(): with open(filename) as f: message = email.message_from_file(f) body = next(email.iterators.typed_subpart_iterator(message, subtype='plain')) - text = ''.join(body_iterator(body, True)) - + text = ''.join(email.iterators.body_line_iterator(body, True)) + if not text: + text = ''.join(email.iterators.body_line_iterator(body, False)) stripped_text = quotations.extract_from_plain(text) reply_text_fn = filename[:-4] + '_reply_text' if os.path.isfile(reply_text_fn): @@ -754,6 +755,6 @@ def test_split_email(): > > """ - expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" + expected_markers = b"stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm" markers = quotations.split_emails(msg) eq_(markers, expected_markers) diff --git a/tests/utils_test.py b/tests/utils_test.py index 778e858c..138338ab 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -16,15 +16,15 @@ def test_get_delimiter(): def test_unicode(): - eq_ (u'hi', 
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 778e858c..138338ab 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -16,15 +16,15 @@ def test_get_delimiter():
 
 
 def test_unicode():
-    eq_ (u'hi', u.to_unicode('hi'))
-    eq_ (type(u.to_unicode('hi')), six.text_type )
-    eq_ (type(u.to_unicode(u'hi')), six.text_type )
-    eq_ (type(u.to_unicode('привет')), six.text_type )
-    eq_ (type(u.to_unicode(u'привет')), six.text_type )
-    eq_ (u"привет", u.to_unicode('привет'))
-    eq_ (u"привет", u.to_unicode(u'привет'))
+    eq_ ('hi', u.to_unicode('hi'))
+    eq_ (type(u.to_unicode('hi')), str )
+    eq_ (type(u.to_unicode('hi')), str )
+    eq_ (type(u.to_unicode('привет')), str )
+    eq_ (type(u.to_unicode('привет')), str )
+    eq_ ("привет", u.to_unicode('привет'))
+    eq_ ("привет", u.to_unicode('привет'))
     # some latin1 stuff
-    eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
+    eq_ ("Versión", u.to_unicode('Versi\xf3n', precise=True))
 
 
 def test_detect_encoding():
@@ -79,7 +79,7 @@ def test_html_to_text():
 """
     text = u.html_to_text(html)
     eq_(b"Hello world! \n\n * One! \n * Two \nHaha", text)
-    eq_(u"привет!", u.html_to_text("привет!").decode('utf8'))
+    eq_("привет!".encode('utf-8'), u.html_to_text("привет!"))
 
     html = '<body><br/><br/>Hi</body>'
     eq_ (b'Hi', u.html_to_text(html))
@@ -115,7 +115,7 @@ def test_html_to_text():
 def test_comment_no_parent():
     s = "<!-- COMMENT 1 --> no comment"
     d = u.html_document_fromstring(s)
-    eq_("no comment", u.html_tree_to_text(d))
+    eq_(b"no comment", u.html_tree_to_text(d))
 
 
 @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
@@ -156,5 +156,5 @@ def test_html_too_big():
 
 @patch.object(u, '_MAX_TAGS_COUNT', 3)
 def test_html_to_text():
-    eq_("Hello", u.html_to_text("<div>Hello</div>"))
+    eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
     eq_(None, u.html_to_text("<div><span>Hi</span></div>"))