From 97e384917185ea05a5db24cebf77dffaad0d5fae Mon Sep 17 00:00:00 2001
From: Sumeet Jain <sumeetjain@berkeley.edu>
Date: Sun, 24 Jan 2016 21:46:12 -0800
Subject: [PATCH 1/6] checkpoint 1. Trying to convert what should be bytes vs
 str

---
 talon/html_quotations.py                 |   6 ++--
 talon/quotations.py                      |  36 +++++++++++------------
 talon/signature/__init__.py              |   1 +
 talon/signature/data/classifier          | Bin 608 -> 600 bytes
 talon/signature/data/classifier_01.npy   | Bin 96 -> 88 bytes
 talon/signature/data/classifier_02.npy   | Bin 176 -> 96 bytes
 talon/signature/data/classifier_03.npy   | Bin 88 -> 176 bytes
 talon/signature/learning/dataset.py      |   6 ++--
 talon/utils.py                           |  18 ++++++++++--
 tests/html_quotations_test.py            |   2 +-
 tests/signature/bruteforce_test.py       |   4 +--
 tests/signature/extraction_test.py       |   2 +-
 tests/signature/learning/dataset_test.py |   2 +-
 tests/signature/learning/helpers_test.py |   2 +-
 tests/text_quotations_test.py            |  29 +++++++++---------
 tests/utils_test.py                      |  16 +++++-----
 16 files changed, 70 insertions(+), 54 deletions(-)

diff --git a/talon/html_quotations.py b/talon/html_quotations.py
index 44afb6b2..2e3b1cac 100644
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -8,9 +8,9 @@
 
 from talon.utils import cssselect 
 
-CHECKPOINT_PREFIX = '#!%!'
-CHECKPOINT_SUFFIX = '!%!#'
-CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
+CHECKPOINT_PREFIX = b'#!%!'
+CHECKPOINT_SUFFIX = b'!%!#'
+CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + b'\d+' + CHECKPOINT_SUFFIX)
 
 # HTML quote indicators (tag ids)
 QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
diff --git a/talon/quotations.py b/talon/quotations.py
index 60163109..9b111065 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -25,9 +25,9 @@
 RE_FWD = re.compile("^[-]+[ ]*Forwarded message[ ]*[-]+$", re.I | re.M)
 
 RE_ON_DATE_SMB_WROTE = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
+    '(-*[>]?[ ]?({0})[ ].*({1})(.*\n){{0,2}}.*({2}):?-*)'.format(
         # Beginning of the line
-        u'|'.join((
+        '|'.join((
             # English
             'On',
             # French
@@ -39,25 +39,25 @@
             # German
             'Am',
             # Norwegian
-            u'På',
+            'På',
             # Swedish, Danish
             'Den',
         )),
         # Date and sender separator
-        u'|'.join((
+        '|'.join((
             # most languages separate date and sender address by comma
             ',',
             # polish date and sender address separator
-            u'użytkownik'
+            'użytkownik'
         )),
         # Ending of the line
-        u'|'.join((
+        '|'.join((
             # English
             'wrote', 'sent',
             # French
-            u'a écrit',
+            'a écrit',
             # Polish
-            u'napisał',
+            'napisał',
             # Dutch
             'schreef','verzond','geschreven',
             # German
@@ -68,15 +68,15 @@
     ))
 # Special case for languages where text is translated like this: 'on {date} wrote {somebody}:'
 RE_ON_DATE_WROTE_SMB = re.compile(
-    u'(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
+    '(-*[>]?[ ]?({0})[ ].*(.*\n){{0,2}}.*({1})[ ]*.*:)'.format(
         # Beginning of the line
-        u'|'.join((
+        '|'.join((
         	'Op',
         	#German
         	'Am'
         )),
         # Ending of the line
-        u'|'.join((
+        '|'.join((
             # Dutch
             'schreef','verzond','geschreven',
             # German
@@ -121,22 +121,22 @@
 
 # ------Original Message------ or ---- Reply Message ----
 # With variations in other languages.
-RE_ORIGINAL_MESSAGE = re.compile(u'[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
-    u'|'.join((
+RE_ORIGINAL_MESSAGE = re.compile('[\s]*[-]+[ ]*({})[ ]*[-]+'.format(
+    '|'.join((
         # English
         'Original Message', 'Reply Message',
         # German
-        u'Ursprüngliche Nachricht', 'Antwort Nachricht',
+        'Ursprüngliche Nachricht', 'Antwort Nachricht',
         # Danish
         'Oprindelig meddelelse',
     ))), re.I)
 
-RE_FROM_COLON_OR_DATE_COLON = re.compile(u'(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]?.*'.format(
-    u'|'.join((
+RE_FROM_COLON_OR_DATE_COLON = re.compile('(_+\r?\n)?[\s]*(:?[*]?{})[\s]?:[*]? .*'.format(
+    '|'.join((
         # "From" in different languages.
-        'From', 'Van', 'De', 'Von', 'Fra', u'Från',
+        'From', 'Van', 'De', 'Von', 'Fra', 'Från',
         # "Date" in different languages.
-        'Date', 'Datum', u'Envoyé', 'Skickat', 'Sendt',
+        'Date', 'Datum', 'Envoyé', 'Skickat', 'Sendt',
     ))), re.I)
 
 # ---- John Smith wrote ----
diff --git a/talon/signature/__init__.py b/talon/signature/__init__.py
index fc60e1d8..6398f522 100644
--- a/talon/signature/__init__.py
+++ b/talon/signature/__init__.py
@@ -35,5 +35,6 @@
 
 
 def initialize():
+    print(EXTRACTOR_FILENAME)
     extraction.EXTRACTOR = classifier.load(EXTRACTOR_FILENAME,
                                            EXTRACTOR_DATA)
diff --git a/talon/signature/data/classifier b/talon/signature/data/classifier
index 1c3a4b0865f3e951b1b3b17fb31bacc48d8d005b..4ee71eec21ee1e6549a036ebe124d3c2b290df13 100644
GIT binary patch
literal 600
zcmaix!EV$r5QejxU3P(jw5lpm#IZ=_Km-MGszj@;RH>mk6bX)&lgu^-#|gHxG>1w(
zl^*8QPM?Oi;e^CPfMa$GobbW2M!)C({+YAE^<h8o2UpEU30$kl%*!Z~+!!!~x8Dfm
zs&C&->5875(j}eJCXQ-QT-uhfFSiH>dBY`LU}!5z`>!9q`u5||^eGNxWn7DU=2OEp
z<g5_t7zle)+TYn&xQ^za)5-wF0p1bHg3h3}jPUUi2TokkxtZugl}eFDswwN1)gp7e
zg!hxZU9EZhUh}#J{Ws;<G^r0lIP{+gM=Ix?+R1P{6Vk;fU6YBIiJ1V+o;{CL-BOd{
z<PVR@xODUs_i#mXl~*M*mRq0%$0a{u!ZFg}_s|*mw+pc5Eu*Z`5S5Lz;<qv0>U!}G
z^oDAf+qQ<T(ibn?B;wu|I-<=u#NIX~NpkSh9e>4vrF1&@8U_KKI}NUQa`qX-abY`}
z#pW0lljXXKwc<gs{%^>wWyoPx!R&wP7fbbBpM))@8dh`d_B#`n-HeJZnl$<aVw3A|

literal 608
zcmZut%Z}496iu5B4QUIMH^cBApu;1A0kJ{?RHTszlf|G5OUp^!iNSGlZKq5Ym0$z&
zReTI9Bt8OMchm*qE-d-#yzcqZ&h#h2sFIPM6;URr)=Y<QxwN}?@5do}TVJs?$My(1
zqSP9F7-+1iViS^cxzDfztue0cyxjEMKyV~N@YqV59T=u06*~G@rh=!Dtc$8470;}B
zn7n?a6m32zT2)Mi*tM3y9FLD6&~@tMFw7h!loPtZhS-B($_10OVAzMDL-8qRiadH8
z$*RFYokDlUR9b3=Ct?qVT-p>Fs|?W@6*fGcj9bu|)KuW9Ll{iCAu9|H4<RV%0pXT}
zr+0t((3S2(@XRtyl$DA_MJ)_phFZu{d%)SFuoKf%EW~rr=1=i_?8U9`cwrAOj$$8L
z&+$@{ByaxMaZl_STu-6%!}mPA{0%qu@CtM*d!aB5UWK;U#%s`Xc4S$~b@0oX!W+<6
zR8Go*XhRLdn+f=)6nJar#rm_Kzc<DYVJ$1!l;CE*==Sy;?~L*=@8$m#x9y6%Yl{n4
hJrEo@($*aB{g?E_k?vbk-=!j-8Q;$hK1k{``U^Y((T)HB

diff --git a/talon/signature/data/classifier_01.npy b/talon/signature/data/classifier_01.npy
index 11d130269642d0ec81c199f8cb370f603d1e5da9..ea117aacf24f6d9163404cd2f41c7257e003a3fa 100644
GIT binary patch
delta 32
ecmYd@m|!bysH3T)P^*9hxGo-05vY`YwI2X_IS70J

delta 40
dcmaz@m|&}Hq@$^$P^*9hxER3T!+t1j4*;LS2VnpJ

diff --git a/talon/signature/data/classifier_02.npy b/talon/signature/data/classifier_02.npy
index 2cec7290a8fe3f33ddbaa3ea9d539fb58f1ad3ac..11d130269642d0ec81c199f8cb370f603d1e5da9 100644
GIT binary patch
delta 58
vcmdnMm@q-d#4RzWI8{eMy*MMWAXVK;LBmK#Q%9jz0SRz1fWe3TP}&{<v}+8~

delta 139
zcmYe;z&JstKBTBLRYyU+I3uwjRozNK!%#=T&`489p;iG7xQ<EK-_}0))_&uZtNTwK
zVmPoXYnPwy-Cy>86YF~QuRXDUsmOTi@Tw2?E!q<Mcq@O{Gu5a%9}szC-+BB$H)qRF
hdx?~(!DUVR?cWKim@RPMz8^-fHvht!WAgI5JpgIoIvW50

diff --git a/talon/signature/data/classifier_03.npy b/talon/signature/data/classifier_03.npy
index e5762ae5b684a67b4ad4e2b5bc42615db45d2e46..77af8e37a1f789e129b21f1583ff46d817e12656 100644
GIT binary patch
delta 139
zcma#pz&JstKBTBLRYyU+I3uwjRozNK!%#=T&`489p;iG7xP)|FOmt4ZwO2E`lzH|L
z!-1o>od4?F{be6E?a@iCYftPiNBv)aWYq_IITJZ$fyy8DEYZwwj*7gokM(7p$Jz4J
hzM*pUm-S8i?F*eUl@_>f-w&g^9+t75GJW~o9svK-Ir9Jj

delta 50
wcmdnM7%@Rd$1O3ZI8{eMy*MMWAXVK;LBmi-Q%9jz0SRzb&N{NGT>9010AykgrvLx|

diff --git a/talon/signature/learning/dataset.py b/talon/signature/learning/dataset.py
index 308995be..63c0489a 100644
--- a/talon/signature/learning/dataset.py
+++ b/talon/signature/learning/dataset.py
@@ -61,7 +61,7 @@ def parse_msg_sender(filename, sender_known=True):
     if os.path.isfile(filename) and not is_sender_filename(filename):
         with open(filename) as f:
             msg = f.read()
-            sender = u''
+            sender = ''
             if sender_known:
                 sender_filename = build_sender_filename(filename)
                 if os.path.exists(sender_filename):
@@ -124,9 +124,9 @@ def build_detection_dataset(folder, dataset_filename,
     """
     if os.path.exists(dataset_filename):
         os.remove(dataset_filename)
-    build_detection_class(os.path.join(folder, u'P'),
+    build_detection_class(os.path.join(folder, 'P'),
                           dataset_filename, 1)
-    build_detection_class(os.path.join(folder, u'N'),
+    build_detection_class(os.path.join(folder, 'N'),
                           dataset_filename, -1)
 
 
diff --git a/talon/utils.py b/talon/utils.py
index e6c884bf..4ea5ab56 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -37,7 +37,7 @@ def safe_format(format_string, *args, **kwargs):
 
     # ignore other errors
     except:
-        return u''
+        return ''
 
 
 def to_unicode(str_or_unicode, precise=False):
@@ -115,7 +115,21 @@ def get_delimiter(msg_body):
     return delimiter
 
 
-def html_tree_to_text(tree):
+def html_to_text(string):
+    """
+    Dead-simple HTML-to-text converter:
+        >>> html_to_text("one<br>two<br>three")
+        >>> "one\ntwo\nthree"
+
+    NOTES:
+        1. the string is expected to contain UTF-8 encoded HTML!
+        2. returns utf-8 encoded str (not unicode)
+    """
+    s = _prepend_utf8_declaration(string)
+    s = s.replace(b"\n", b"")
+
+    tree = html.fromstring(s)
+
     for style in CSSSelector('style')(tree):
         style.getparent().remove(style)
 
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index b78409bf..3aca19be 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -157,7 +157,7 @@ def test_gmail_quote_blockquote():
 
 
 def test_unicode_in_reply():
-    msg_body = u"""Reply \xa0 \xa0 Text<br>
+    msg_body = """Reply \xa0 \xa0 Text<br>
 
 <div>
   <br>
diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py
index 382615bb..9867901a 100644
--- a/tests/signature/bruteforce_test.py
+++ b/tests/signature/bruteforce_test.py
@@ -128,10 +128,10 @@ def test_blackberry_signature():
     eq_(('Heeyyoooo.', msg_body[len('Heeyyoooo.\n'):]),
         bruteforce.extract_signature(msg_body))
 
-    msg_body = u"""Blah
+    msg_body = """Blah
 Enviado desde mi oficina mÃ³vil BlackBerryÂ® de Telcel"""
 
-    eq_(('Blah', u'Enviado desde mi oficina mÃ³vil BlackBerryÂ® de Telcel'),
+    eq_(('Blah', 'Enviado desde mi oficina mÃ³vil BlackBerryÂ® de Telcel'),
         bruteforce.extract_signature(msg_body))
 
 
diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py
index b5703031..0cbcb46e 100644
--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -127,7 +127,7 @@ def test_handles_unicode():
 @patch.object(signature.extraction, 'has_signature')
 def test_signature_extract_crash(has_signature):
     has_signature.side_effect = Exception('Bam!')
-    msg_body = u'Blah\r\n--\r\n\r\nСергей'
+    msg_body = 'Blah\r\n--\r\n\r\nСергей'
     eq_((msg_body, None), signature.extract(msg_body, 'Сергей'))
 
 
diff --git a/tests/signature/learning/dataset_test.py b/tests/signature/learning/dataset_test.py
index 8e152753..5f84bac3 100644
--- a/tests/signature/learning/dataset_test.py
+++ b/tests/signature/learning/dataset_test.py
@@ -32,7 +32,7 @@ def test_parse_msg_sender():
     # if the message sender is stored in a separate file
     sender, msg = d.parse_msg_sender(MSG_FILENAME_WITH_BODY_SUFFIX)
     with open(MSG_FILENAME_WITH_BODY_SUFFIX) as f:
-        eq_(sender, u"john@example.com")
+        eq_(sender, "john@example.com")
         eq_(msg, f.read())
 
 
diff --git a/tests/signature/learning/helpers_test.py b/tests/signature/learning/helpers_test.py
index d9e7b866..2870c822 100644
--- a/tests/signature/learning/helpers_test.py
+++ b/tests/signature/learning/helpers_test.py
@@ -151,7 +151,7 @@ def test_extract_names():
         ['David', 'DECOSTER', 'Domicile']
         }
 
-    for sender, expected_names in senders_names.items():
+    for sender, expected_names in list(senders_names.items()):
         extracted_names = h.extract_names(sender)
         # check that extracted names could be compiled
         try:
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 7a81c994..2e274fba 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -54,7 +54,7 @@ def test_pattern_on_date_wrote_somebody():
     """Lorem
 
 Op 13-02-2014 3:18 schreef Julius Caesar <pantheon@rome.com>:
-    
+
 Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
 """))
 
@@ -137,7 +137,7 @@ def test_reply_and_quotation_splitter_share_line():
 
 
 def _check_pattern_original_message(original_message_indicator):
-    msg_body = u"""Test reply
+    msg_body = """Test reply
 
 -----{}-----
 
@@ -145,12 +145,13 @@ def _check_pattern_original_message(original_message_indicator):
     eq_('Test reply', quotations.extract_from_plain(
         msg_body.format(six.text_type(original_message_indicator))))
 
+
 def test_english_original_message():
     _check_pattern_original_message('Original Message')
     _check_pattern_original_message('Reply Message')
 
 def test_german_original_message():
-    _check_pattern_original_message(u'Ursprüngliche Nachricht')
+    _check_pattern_original_message('Ursprüngliche Nachricht')
     _check_pattern_original_message('Antwort Nachricht')
 
 def test_danish_original_message():
@@ -256,7 +257,7 @@ def test_with_indent():
 
 ------On 12/29/1987 17:32 PM, Julius Caesar wrote-----
 
-Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur. 
+Brunch mumblecore pug Marfa tofu, irure taxidermy hoodie readymade pariatur.
     """
     eq_("YOLO salvia cillum kogi typewriter mumblecore cardigan skateboard Austin.", quotations.extract_from_plain(msg_body))
 
@@ -267,8 +268,8 @@ def test_short_quotation_with_newline():
 On Tue, Jan 27, 2015 at 12:42 PM -0800, "Company" <christine.XXX@XXX.com> wrote:
 
 Hi Mark,
-Blah blah? 
-Thanks,Christine 
+Blah blah?
+Thanks,Christine
 
 On Jan 27, 2015, at 11:55 AM, Mark XXX <mark@XXX.com> wrote:
 
@@ -312,7 +313,7 @@ def test_german_from_block():
 
 def test_french_multiline_from_block():
     eq_('Lorem ipsum', quotations.extract_from_plain(
-    u"""Lorem ipsum
+    """Lorem ipsum
 
 De : Brendan xxx [mailto:brendan.xxx@xxx.com]
 Envoyé : vendredi 23 janvier 2015 16:39
@@ -324,7 +325,7 @@ def test_french_multiline_from_block():
 
 def test_french_from_block():
     eq_('Lorem ipsum', quotations.extract_from_plain(
-    u"""Lorem ipsum
+    """Lorem ipsum
 
 Le 23 janv. 2015 à 22:03, Brendan xxx <brendan.xxx@xxx.com<mailto:brendan.xxx@xxx.com>> a écrit:
 
@@ -332,7 +333,7 @@ def test_french_from_block():
 
 def test_polish_from_block():
     eq_('Lorem ipsum', quotations.extract_from_plain(
-    u"""Lorem ipsum
+    """Lorem ipsum
 
 W dniu 28 stycznia 2015 01:53 użytkownik Zoe xxx <zoe.xxx@xxx.com>
 napisał:
@@ -354,7 +355,7 @@ def test_danish_from_block():
 
 def test_swedish_from_block():
     eq_('Allo! Follow up MIME!', quotations.extract_from_plain(
-    u"""Allo! Follow up MIME!
+    """Allo! Follow up MIME!
 Från: Anno Sportel [mailto:anno.spoel@hsbcssad.com]
 Skickat: den 26 augusti 2015 14:45
 Till: Isacson Leiff
@@ -373,7 +374,7 @@ def test_swedish_from_line():
 
 def test_norwegian_from_line():
     eq_('Lorem', quotations.extract_from_plain(
-    u"""Lorem
+    """Lorem
 På 14 september 2015 på 02:23:18, Valentino Rudy (valentino@rudy.be) skrev:
 
 Veniam laborum mlkshk kale chips authentic. Normcore mumblecore laboris, fanny pack readymade eu blog chia pop-up freegan enim master cleanse.
@@ -381,11 +382,11 @@ def test_norwegian_from_line():
 
 def test_dutch_from_block():
     eq_('Gluten-free culpa lo-fi et nesciunt nostrud.', quotations.extract_from_plain(
-    """Gluten-free culpa lo-fi et nesciunt nostrud. 
+    """Gluten-free culpa lo-fi et nesciunt nostrud.
 
 Op 17-feb.-2015, om 13:18 heeft Julius Caesar <pantheon@rome.com> het volgende geschreven:
-    
-Small batch beard laboris tempor, non listicle hella Tumblr heirloom. 
+
+Small batch beard laboris tempor, non listicle hella Tumblr heirloom.
 """))
 
 
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 778e858c..1838db6c 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -16,15 +16,15 @@ def test_get_delimiter():
 
 
 def test_unicode():
-    eq_ (u'hi', u.to_unicode('hi'))
-    eq_ (type(u.to_unicode('hi')), six.text_type )
-    eq_ (type(u.to_unicode(u'hi')), six.text_type )
-    eq_ (type(u.to_unicode('привет')), six.text_type )
-    eq_ (type(u.to_unicode(u'привет')), six.text_type )
-    eq_ (u"привет", u.to_unicode('привет'))
-    eq_ (u"привет", u.to_unicode(u'привет'))
+    eq_ ('hi', u.to_unicode('hi'))
+    eq_ (type(u.to_unicode('hi')), str )
+    eq_ (type(u.to_unicode('hi')), str )
+    eq_ (type(u.to_unicode('привет')), str )
+    eq_ (type(u.to_unicode('привет')), str )
+    eq_ ("привет", u.to_unicode('привет'))
+    eq_ ("привет", u.to_unicode('привет'))
     # some latin1 stuff
-    eq_ (u"Versión", u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True))
+    eq_ ("Versión", u.to_unicode('Versi\xf3n', precise=True))
 
 
 def test_detect_encoding():

From f02a866f2a07b7f233c83daea46559a1453ce3c5 Mon Sep 17 00:00:00 2001
From: Sumeet Jain <sumeetjain@berkeley.edu>
Date: Sun, 24 Jan 2016 22:18:44 -0800
Subject: [PATCH 2/6] further progress. text_quotations almost works

---
 talon/quotations.py           | 24 ++++++++++++------------
 tests/text_quotations_test.py |  9 +++++----
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 9b111065..d64f4c87 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -86,7 +86,7 @@
     )
 
 RE_QUOTATION = re.compile(
-    r'''
+    rb'''
     (
         # quotation border: splitter line or a number of quotation marker lines
         (?:
@@ -107,7 +107,7 @@
     ''', re.VERBOSE)
 
 RE_EMPTY_QUOTATION = re.compile(
-    r'''
+    rb'''
     (
         # quotation border: splitter line or a number of quotation marker lines
         (?:
@@ -225,11 +225,11 @@ def mark_message_lines(lines):
     i = 0
     while i < len(lines):
         if not lines[i].strip():
-            markers[i] = 'e'  # empty line
+            markers[i] = ord('e')  # empty line
         elif QUOT_PATTERN.match(lines[i]):
-            markers[i] = 'm'  # line with quotation marker
+            markers[i] = ord('m')  # line with quotation marker
         elif RE_FWD.match(lines[i]):
-            markers[i] = 'f'  # ---- Forwarded message ----
+            markers[i] = ord('f')  # ---- Forwarded message ----
         else:
             # in case splitter is spread across several lines
             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
@@ -238,13 +238,13 @@ def mark_message_lines(lines):
                 # append as many splitter markers as lines in splitter
                 splitter_lines = splitter.group().splitlines()
                 for j in range(len(splitter_lines)):
-                    markers[i + j] = 's'
+                    markers[i + j] = ord('s')
 
                 # skip splitter lines
                 i += len(splitter_lines) - 1
             else:
                 # probably the line from the last message in the conversation
-                markers[i] = 't'
+                markers[i] = ord('t')
         i += 1
 
     return ''.join(markers)
@@ -263,17 +263,17 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
     """
     markers = ''.join(markers)
     # if there are no splitter there should be no markers
-    if 's' not in markers and not re.search('(me*){3}', markers):
-        markers = markers.replace('m', 't')
+    if b's' not in markers and not re.search(b'(me*){3}', markers):
+        markers = markers.replace(b'm', b't')
 
-    if re.match('[te]*f', markers):
+    if re.match(b'[te]*f', markers):
         return_flags[:] = [False, -1, -1]
         return lines
 
     # inlined reply
     # use lookbehind assertions to find overlapping entries e.g. for 'mtmtm'
     # both 't' entries should be found
-    for inline_reply in re.finditer('(?<=m)e*((?:t+e*)+)m', markers):
+    for inline_reply in re.finditer(b'(?<=m)e*((?:t+e*)+)m', markers):
         # long links could break sequence of quotation lines but they shouldn't
         # be considered an inline reply
         links = (
@@ -284,7 +284,7 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
             return lines
 
     # cut out text lines coming after splitter if there are no markers there
-    quotation = re.search('(se*)+((t|f)+e*)+', markers)
+    quotation = re.search(b'(se*)+((t|f)+e*)+', markers)
     if quotation:
         return_flags[:] = [True, quotation.start(), len(lines)]
         return lines[:quotation.start()]
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index 2e274fba..a6e33574 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -516,7 +516,7 @@ def test_mark_message_lines():
              '> Hi',
              '',
              'Signature']
-    eq_('tessemet', quotations.mark_message_lines(lines))
+    eq_(b'tessemet', quotations.mark_message_lines(lines))
 
     lines = ['Just testing the email reply',
              '',
@@ -530,7 +530,7 @@ def test_mark_message_lines():
              'wrote:',
              '',
              'Tarmo Lehtpuu has posted the following message on']
-    eq_('tettessset', quotations.mark_message_lines(lines))
+    eq_(b'tettessset', quotations.mark_message_lines(lines))
 
 
 def test_process_marked_lines():
@@ -696,8 +696,9 @@ def test_standard_replies():
         with open(filename) as f:
             message = email.message_from_file(f)
             body = next(email.iterators.typed_subpart_iterator(message, subtype='plain'))
-            text = ''.join(body_iterator(body, True))
-
+            text = ''.join(email.iterators.body_line_iterator(body, True))
+            if not text:
+              text = ''.join(email.iterators.body_line_iterator(body, False))
             stripped_text = quotations.extract_from_plain(text)
             reply_text_fn = filename[:-4] + '_reply_text'
             if os.path.isfile(reply_text_fn):

From 34eb4ff7f493003bacd5910e4cf4b0044e1bb12b Mon Sep 17 00:00:00 2001
From: Sumeet Jain <sumeetjain@berkeley.edu>
Date: Sat, 30 Jan 2016 14:33:24 -0800
Subject: [PATCH 3/6] passing utils

---
 talon/utils.py      | 8 ++++----
 tests/utils_test.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/talon/utils.py b/talon/utils.py
index 4ea5ab56..a7d81e4f 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -126,7 +126,7 @@ def html_to_text(string):
         2. returns utf-8 encoded str (not unicode)
     """
     s = _prepend_utf8_declaration(string)
-    s = s.replace(b"\n", b"")
+    s = s.replace("\n", "")
 
     tree = html.fromstring(s)
 
@@ -223,7 +223,7 @@ def html_too_big(s):
 def _contains_charset_spec(s):
     """Return True if the first 4KB contain charset spec
     """
-    return s.lower().find(b'html; charset=', 0, 4096) != -1
+    return s.lower().find('html; charset=', 0, 4096) != -1
 
 
 def _prepend_utf8_declaration(s):
@@ -259,8 +259,8 @@ def _html5lib_parser():
     )
 
 
-_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
-                     b'charset=utf-8">')
+_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
+                     'charset=utf-8">')
 
 
 _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 1838db6c..1003f7b5 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -79,7 +79,7 @@ def test_html_to_text():
 </body>"""
     text = u.html_to_text(html)
     eq_(b"Hello world! \n\n  * One! \n  * Two \nHaha", text)
-    eq_(u"привет!", u.html_to_text("<b>привет!</b>").decode('utf8'))
+    eq_("привет!".encode('utf-8'), u.html_to_text("<b>привет!</b>"))
 
     html = '<body><br/><br/>Hi</body>'
     eq_ (b'Hi', u.html_to_text(html))

From 895a3fa9d5e30d1d814a7a489ffcc8bd5b205fa7 Mon Sep 17 00:00:00 2001
From: Sumeet Jain <sumeetjain@berkeley.edu>
Date: Sat, 30 Jan 2016 15:25:35 -0800
Subject: [PATCH 4/6] almost all tests are passing

---
 talon/html_quotations.py           |  6 +++---
 talon/quotations.py                |  5 ++++-
 talon/signature/bruteforce.py      |  8 ++++----
 talon/signature/extraction.py      |  8 ++++----
 talon/utils.py                     | 16 +---------------
 tests/html_quotations_test.py      |  6 +++---
 tests/signature/bruteforce_test.py | 14 +++++++-------
 tests/signature/extraction_test.py | 16 ++++++++--------
 tests/text_quotations_test.py      | 21 ++++++++++-----------
 9 files changed, 44 insertions(+), 56 deletions(-)

diff --git a/talon/html_quotations.py b/talon/html_quotations.py
index 2e3b1cac..44afb6b2 100644
--- a/talon/html_quotations.py
+++ b/talon/html_quotations.py
@@ -8,9 +8,9 @@
 
 from talon.utils import cssselect 
 
-CHECKPOINT_PREFIX = b'#!%!'
-CHECKPOINT_SUFFIX = b'!%!#'
-CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + b'\d+' + CHECKPOINT_SUFFIX)
+CHECKPOINT_PREFIX = '#!%!'
+CHECKPOINT_SUFFIX = '!%!#'
+CHECKPOINT_PATTERN = re.compile(CHECKPOINT_PREFIX + '\d+' + CHECKPOINT_SUFFIX)
 
 # HTML quote indicators (tag ids)
 QUOTE_IDS = ['OLK_SRC_BODY_SECTION']
diff --git a/talon/quotations.py b/talon/quotations.py
index d64f4c87..8da4507d 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -489,7 +489,7 @@ def _extract_from_html(msg_body):
     if _readable_text_empty(html_tree_copy):
         return msg_body
 
-    return html.tostring(html_tree_copy)
+    return _html_tostring(html_tree_copy)
 
 
 def split_emails(msg):
@@ -598,3 +598,6 @@ def register_xpath_extensions():
     ns.prefix = 'mg'
     ns['text_content'] = text_content
     ns['tail'] = tail
+
+def _html_tostring(html_tree):
+  return html.tostring(html_tree).decode('utf-8')
diff --git a/talon/signature/bruteforce.py b/talon/signature/bruteforce.py
index 7f666bd9..bfc72f2d 100644
--- a/talon/signature/bruteforce.py
+++ b/talon/signature/bruteforce.py
@@ -50,7 +50,7 @@
 # c - could be signature line
 # d - line starts with dashes (could be signature or list item)
 # l - long line
-RE_SIGNATURE_CANDIDATE = re.compile(r'''
+RE_SIGNATURE_CANDIDATE = re.compile(br'''
     (?P<candidate>c+d)[^d]
     |
     (?P<candidate>c+d)$
@@ -163,16 +163,16 @@ def _mark_candidate_indexes(lines, candidate):
     'cdc'
     """
     # at first consider everything to be potential signature lines
-    markers = bytearray('c'*len(candidate))
+    markers = bytearray('c'*len(candidate), 'utf-8')
 
     # mark lines starting from bottom up
     for i, line_idx in reversed(list(enumerate(candidate))):
         if len(lines[line_idx].strip()) > TOO_LONG_SIGNATURE_LINE:
-            markers[i] = 'l'
+            markers[i] = ord(b'l')
         else:
             line = lines[line_idx].strip()
             if line.startswith('-') and line.strip("-"):
-                markers[i] = 'd'
+                markers[i] = ord(b'd')
 
     return markers
 
diff --git a/talon/signature/extraction.py b/talon/signature/extraction.py
index 32591717..20263285 100644
--- a/talon/signature/extraction.py
+++ b/talon/signature/extraction.py
@@ -18,7 +18,7 @@
 
 # regex signature pattern for reversed lines
 # assumes that all long lines have been excluded
-RE_REVERSE_SIGNATURE = re.compile(r'''
+RE_REVERSE_SIGNATURE = re.compile(br'''
 # signature should consists of blocks like this
 (?:
    # it could end with empty line
@@ -81,7 +81,7 @@ def _mark_lines(lines, sender):
     candidate = get_signature_candidate(lines)
 
     # at first consider everything to be text no signature
-    markers = bytearray('t'*len(lines))
+    markers = bytearray('t'*len(lines), 'utf-8')
 
     # mark lines starting from bottom up
     # mark only lines that belong to candidate
@@ -92,9 +92,9 @@ def _mark_lines(lines, sender):
         # relative to lines not candidate
         j = len(lines) - len(candidate) + i
         if not line.strip():
-            markers[j] = 'e'
+            markers[j] = ord(b'e')
         elif is_signature_line(line, sender, EXTRACTOR):
-            markers[j] = 's'
+            markers[j] = ord(b's')
 
     return markers
 
diff --git a/talon/utils.py b/talon/utils.py
index a7d81e4f..46e23259 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -115,21 +115,7 @@ def get_delimiter(msg_body):
     return delimiter
 
 
-def html_to_text(string):
-    """
-    Dead-simple HTML-to-text converter:
-        >>> html_to_text("one<br>two<br>three")
-        >>> "one\ntwo\nthree"
-
-    NOTES:
-        1. the string is expected to contain UTF-8 encoded HTML!
-        2. returns utf-8 encoded str (not unicode)
-    """
-    s = _prepend_utf8_declaration(string)
-    s = s.replace("\n", "")
-
-    tree = html.fromstring(s)
-
+def html_tree_to_text(tree):
     for style in CSSSelector('style')(tree):
         style.getparent().remove(style)
 
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index 3aca19be..9d5f294e 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -165,7 +165,7 @@ def test_unicode_in_reply():
 
 <blockquote>
   Quote
-</blockquote>""".encode("utf-8")
+</blockquote>"""
 
     eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
         "</body></html>",
@@ -373,8 +373,8 @@ def test_CRLF():
 </blockquote>"""
     msg_body = msg_body.replace('\n', '\r\n')
     extracted = quotations.extract_from_html(msg_body)
-    assert_false(symbol in extracted)    
-    # Keep new lines otherwise "My reply" becomes one word - "Myreply" 
+    assert_false(symbol in extracted)
+    # Keep new lines otherwise "My reply" becomes one word - "Myreply"
     eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
 
 
diff --git a/tests/signature/bruteforce_test.py b/tests/signature/bruteforce_test.py
index 9867901a..65b53242 100644
--- a/tests/signature/bruteforce_test.py
+++ b/tests/signature/bruteforce_test.py
@@ -200,14 +200,14 @@ def test_get_signature_candidate():
 def test_mark_candidate_indexes():
     with patch.object(bruteforce, 'TOO_LONG_SIGNATURE_LINE', 3):
         # spaces are not considered when checking line length
-        eq_('clc',
+        eq_(b'clc',
             bruteforce._mark_candidate_indexes(
                 ['BR,  ', 'long', 'Bob'],
                 [0, 1, 2]))
 
         # only candidate lines are marked
         # if line has only dashes it's a candidate line
-        eq_('ccdc',
+        eq_(b'ccdc',
             bruteforce._mark_candidate_indexes(
                 ['-', 'long', '-', '- i', 'Bob'],
                 [0, 2, 3, 4]))
@@ -216,20 +216,20 @@ def test_mark_candidate_indexes():
 def test_process_marked_candidate_indexes():
     eq_([2, 13, 15],
         bruteforce._process_marked_candidate_indexes(
-            [2, 13, 15], 'dcc'))
+            [2, 13, 15], b'dcc'))
 
     eq_([15],
         bruteforce._process_marked_candidate_indexes(
-            [2, 13, 15], 'ddc'))
+            [2, 13, 15], b'ddc'))
 
     eq_([13, 15],
         bruteforce._process_marked_candidate_indexes(
-            [13, 15], 'cc'))
+            [13, 15], b'cc'))
 
     eq_([15],
         bruteforce._process_marked_candidate_indexes(
-            [15], 'lc'))
+            [15], b'lc'))
 
     eq_([15],
         bruteforce._process_marked_candidate_indexes(
-            [13, 15], 'ld'))
+            [13, 15], b'ld'))
diff --git a/tests/signature/extraction_test.py b/tests/signature/extraction_test.py
index 0cbcb46e..1720100e 100644
--- a/tests/signature/extraction_test.py
+++ b/tests/signature/extraction_test.py
@@ -135,7 +135,7 @@ def test_mark_lines():
     with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 2):
         # we analyse the 2nd line as well though it's the 6th line
         # (starting from the bottom) because we don't count empty line
-        eq_('ttset',
+        eq_(b'ttset',
             e._mark_lines(['Bob Smith',
                           'Bob Smith',
                           'Bob Smith',
@@ -145,7 +145,7 @@ def test_mark_lines():
     with patch.object(bruteforce, 'SIGNATURE_MAX_LINES', 3):
         # we don't analyse the 1st line because
         # signature cant start from the 1st line
-        eq_('tset',
+        eq_(b'tset',
             e._mark_lines(['Bob Smith',
                           'Bob Smith',
                           '',
@@ -154,20 +154,20 @@ def test_mark_lines():
 
 def test_process_marked_lines():
     # no signature found
-    eq_((list(range(5)), None), e._process_marked_lines(list(range(5)), 'telt'))
+    eq_((list(range(5)), None), e._process_marked_lines(list(range(5)), b'telt'))
 
     # signature in the middle of the text
-    eq_((list(range(9)), None), e._process_marked_lines(list(range(9)), 'tesestelt'))
+    eq_((list(range(9)), None), e._process_marked_lines(list(range(9)), b'tesestelt'))
 
     # long line splits signature
     eq_((list(range(7)), [7, 8]),
-        e._process_marked_lines(list(range(9)), 'tsslsless'))
+        e._process_marked_lines(list(range(9)), b'tsslsless'))
 
     eq_((list(range(20)), [20]),
-        e._process_marked_lines(list(range(21)), 'ttttttstttesllelelets'))
+        e._process_marked_lines(list(range(21)), b'ttttttstttesllelelets'))
 
     # some signature lines could be identified as text
-    eq_(([0], list(range(1, 9))), e._process_marked_lines(list(range(9)), 'tsetetest'))
+    eq_(([0], list(range(1, 9))), e._process_marked_lines(list(range(9)), b'tsetetest'))
 
     eq_(([], list(range(5))),
-        e._process_marked_lines(list(range(5)), "ststt"))
+        e._process_marked_lines(list(range(5)), b"ststt"))
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index a6e33574..d21ea051 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -536,35 +536,34 @@ def test_mark_message_lines():
 def test_process_marked_lines():
     # quotations and last message lines are mixed
     # consider all to be a last message
-    markers = 'tsemmtetm'
-    lines = [str(i) for i in range(len(markers))]
+    markers = b'tsemmtetm'
     lines = [str(i) for i in range(len(markers))]
 
     eq_(lines, quotations.process_marked_lines(lines, markers))
 
     # no splitter => no markers
-    markers = 'tmm'
+    markers = b'tmm'
     lines = ['1', '2', '3']
     eq_(['1', '2', '3'], quotations.process_marked_lines(lines, markers))
 
     # text after splitter without markers is quotation
-    markers = 'tst'
+    markers = b'tst'
     lines = ['1', '2', '3']
     eq_(['1'], quotations.process_marked_lines(lines, markers))
 
     # message + quotation + signature
-    markers = 'tsmt'
+    markers = b'tsmt'
     lines = ['1', '2', '3', '4']
     eq_(['1', '4'], quotations.process_marked_lines(lines, markers))
 
     # message + <quotation without markers> + nested quotation
-    markers = 'tstsmt'
+    markers = b'tstsmt'
     lines = ['1', '2', '3', '4', '5', '6']
     eq_(['1'], quotations.process_marked_lines(lines, markers))
 
     # test links wrapped with paranthesis
     # link starts on the marker line
-    markers = 'tsmttem'
+    markers = b'tsmttem'
     lines = ['text',
              'splitter',
              '>View (http://example.com',
@@ -575,7 +574,7 @@ def test_process_marked_lines():
     eq_(lines[:1], quotations.process_marked_lines(lines, markers))
 
     # link starts on the new line
-    markers = 'tmmmtm'
+    markers = b'tmmmtm'
     lines = ['text',
              '>'
              '>',
@@ -586,7 +585,7 @@ def test_process_marked_lines():
     eq_(lines[:1], quotations.process_marked_lines(lines, markers))
 
     # check all "inline" replies
-    markers = 'tsmtmtm'
+    markers = b'tsmtmtm'
     lines = ['text',
              'splitter',
              '>',
@@ -597,7 +596,7 @@ def test_process_marked_lines():
     eq_(lines, quotations.process_marked_lines(lines, markers))
 
     # inline reply with link not wrapped in paranthesis
-    markers = 'tsmtm'
+    markers = b'tsmtm'
     lines = ['text',
              'splitter',
              '>',
@@ -606,7 +605,7 @@ def test_process_marked_lines():
     eq_(lines, quotations.process_marked_lines(lines, markers))
 
     # inline reply with link wrapped in paranthesis
-    markers = 'tsmtm'
+    markers = b'tsmtm'
     lines = ['text',
              'splitter',
              '>',

From 3b2f39cbcd2ed5a8ca78e40fd657d12b149c19a8 Mon Sep 17 00:00:00 2001
From: Samiur Rahman <me@samiurr.com>
Date: Sun, 16 Apr 2017 13:49:31 -0700
Subject: [PATCH 5/6] Fix up bytes

---
 talon/quotations.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 8da4507d..2fba83fd 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -221,15 +221,15 @@ def mark_message_lines(lines):
     >>> mark_message_lines(['answer', 'From: foo@bar.com', '', '> question'])
     'tsem'
     """
-    markers = ['e' for _ in lines]
+    markers = [b'e' for _ in lines]
     i = 0
     while i < len(lines):
         if not lines[i].strip():
-            markers[i] = ord('e')  # empty line
+            markers[i] = b'e'  # empty line
         elif QUOT_PATTERN.match(lines[i]):
-            markers[i] = ord('m')  # line with quotation marker
+            markers[i] = b'm'  # line with quotation marker
         elif RE_FWD.match(lines[i]):
-            markers[i] = ord('f')  # ---- Forwarded message ----
+            markers[i] = b'f'  # ---- Forwarded message ----
         else:
             # in case splitter is spread across several lines
             splitter = is_splitter('\n'.join(lines[i:i + SPLITTER_MAX_LINES]))
@@ -238,16 +238,16 @@ def mark_message_lines(lines):
                 # append as many splitter markers as lines in splitter
                 splitter_lines = splitter.group().splitlines()
                 for j in range(len(splitter_lines)):
-                    markers[i + j] = ord('s')
+                    markers[i + j] = b's'
 
                 # skip splitter lines
                 i += len(splitter_lines) - 1
             else:
                 # probably the line from the last message in the conversation
-                markers[i] = ord('t')
+                markers[i] = b't'
         i += 1
 
-    return ''.join(markers)
+    return b''.join(markers)
 
 
 def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
@@ -261,7 +261,6 @@ def process_marked_lines(lines, markers, return_flags=[False, -1, -1]):
     return_flags = [were_lines_deleted, first_deleted_line,
                     last_deleted_line]
     """
-    markers = ''.join(markers)
     # if there are no splitter there should be no markers
     if b's' not in markers and not re.search(b'(me*){3}', markers):
         markers = markers.replace(b'm', b't')

From a6a77af94cdb42b069b23bf17ef654627088ac01 Mon Sep 17 00:00:00 2001
From: Samiur Rahman <me@samiurr.com>
Date: Mon, 24 Apr 2017 22:19:41 -0700
Subject: [PATCH 6/6] Fix most tests

---
 talon/quotations.py           | 21 ++++++++++++---------
 talon/utils.py                | 15 ++++++++++-----
 tests/html_quotations_test.py | 16 ++++++++--------
 tests/text_quotations_test.py |  2 +-
 tests/utils_test.py           |  4 ++--
 5 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/talon/quotations.py b/talon/quotations.py
index 2fba83fd..f540a729 100644
--- a/talon/quotations.py
+++ b/talon/quotations.py
@@ -140,7 +140,7 @@
     ))), re.I)
 
 # ---- John Smith wrote ----
-RE_ANDROID_WROTE = re.compile(u'[\s]*[-]+.*({})[ ]*[-]+'.format(
+RE_ANDROID_WROTE = re.compile('[\s]*[-]+.*({})[ ]*[-]+'.format(
     u'|'.join((
         # English
         'wrote'
@@ -183,6 +183,7 @@
 RE_HEADER = re.compile(": ")
 
 
+
 def extract_from(msg_body, content_type='text/plain'):
     try:
         if content_type == 'text/plain':
@@ -410,7 +411,6 @@ def extract_from_html(msg_body):
 
     return result
 
-
 def _extract_from_html(msg_body):
     """
     Extract not quoted message from provided html message body
@@ -524,43 +524,46 @@ def _mark_quoted_email_splitlines(markers, lines):
     """
     # Create a list of markers to easily alter specific characters
     markerlist = list(markers)
+
     for i, line in enumerate(lines):
-        if markerlist[i] != 'm':
+        if markerlist[i] != b'm'[0]:
             continue
         for pattern in SPLITTER_PATTERNS:
             matcher = re.search(pattern, line)
             if matcher:
-                markerlist[i] = 's'
+                markerlist[i] = b's'[0]
                 break
 
-    return "".join(markerlist)
+    return bytes(markerlist)
 
 
 def _correct_splitlines_in_headers(markers, lines):
     """
     Corrects markers by removing splitlines deemed to be inside header blocks.
     """
-    updated_markers = ""
+    updated_markers = b""
     i = 0
     in_header_block = False
 
     for m in markers:
         # Only set in_header_block flag when we hit an 's' and line is a header
-        if m == 's':
+        m = bytes([m])
+        if m == b"s":
             if not in_header_block:
                 if bool(re.search(RE_HEADER, lines[i])):
                     in_header_block = True
             else:
                 if QUOT_PATTERN.match(lines[i]):
-                    m = 'm'
+                    m = b"m"
                 else:
-                    m = 't'
+                    m = b"t"
 
         # If the line is not a header line, set in_header_block false.
         if not bool(re.search(RE_HEADER, lines[i])):
             in_header_block = False
 
         # Add the marker to the new updated markers string.
+        print(updated_markers, m)
         updated_markers += m
         i += 1
 
diff --git a/talon/utils.py b/talon/utils.py
index 46e23259..a4dfb3bd 100644
--- a/talon/utils.py
+++ b/talon/utils.py
@@ -7,6 +7,7 @@
 import cchardet
 import regex as re
 
+import lxml.html
 from lxml.html import html5parser
 from lxml.cssselect import CSSSelector
 
@@ -177,11 +178,13 @@ def html_to_text(string):
 def html_fromstring(s):
     """Parse html tree from string. Return None if the string can't be parsed.
     """
+    if isinstance(s, bytes):
+        s = s.decode()
     try:
         if html_too_big(s):
             return None
 
-        return html5parser.fromstring(s, parser=_html5lib_parser())
+        return lxml.html.document_fromstring(s, ensure_head_body=True)  # replaces html5parser.fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
 
@@ -189,11 +192,13 @@ def html_fromstring(s):
 def html_document_fromstring(s):
     """Parse html tree from string. Return None if the string can't be parsed.
     """
+    if isinstance(s, bytes):
+        s = s.decode()
     try:
         if html_too_big(s):
             return None
 
-        return html5parser.document_fromstring(s, parser=_html5lib_parser())
+        return lxml.html.document_fromstring(s, ensure_head_body=True)  # replaces html5parser.document_fromstring(s, parser=_html5lib_parser())
     except Exception:
         pass
 
@@ -209,7 +214,7 @@ def html_too_big(s):
 def _contains_charset_spec(s):
     """Return True if the first 4KB contain charset spec
     """
-    return s.lower().find('html; charset=', 0, 4096) != -1
+    return s.lower().find(b'html; charset=', 0, 4096) != -1
 
 
 def _prepend_utf8_declaration(s):
@@ -245,8 +250,8 @@ def _html5lib_parser():
     )
 
 
-_UTF8_DECLARATION = ('<meta http-equiv="Content-Type" content="text/html;'
-                     'charset=utf-8">')
+_UTF8_DECLARATION = (b'<meta http-equiv="Content-Type" content="text/html;'
+                     b'charset=utf-8">')
 
 
 _BLOCKTAGS  = ['div', 'p', 'ul', 'li', 'h1', 'h2', 'h3']
diff --git a/tests/html_quotations_test.py b/tests/html_quotations_test.py
index 9d5f294e..3f7c4389 100644
--- a/tests/html_quotations_test.py
+++ b/tests/html_quotations_test.py
@@ -27,7 +27,7 @@ def test_quotation_splitter_inside_blockquote():
 
 </blockquote>"""
 
-    eq_("<html><head></head><body>Reply</body></html>",
+    eq_("<html><head></head><body><p>Reply</p></body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -44,7 +44,7 @@ def test_quotation_splitter_outside_blockquote():
   </div>
 </blockquote>
 """
-    eq_("<html><head></head><body>Reply</body></html>",
+    eq_("<html><head></head><body><p>Reply</p></body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -62,7 +62,7 @@ def test_regular_blockquote():
   </div>
 </blockquote>
 """
-    eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
+    eq_("<html><head></head><body><p>Reply</p><blockquote>Regular</blockquote></body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -129,7 +129,7 @@ def test_gmail_quote():
     </div>
   </div>
 </div>"""
-    eq_("<html><head></head><body>Reply</body></html>",
+    eq_("<html><head></head><body><p>Reply</p></body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -140,7 +140,7 @@ def test_gmail_quote_compact():
                '<div>Test</div>' \
                '</div>' \
                '</div>'
-    eq_("<html><head></head><body>Reply</body></html>",
+    eq_("<html><head></head><body><p>Reply</p></body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -167,7 +167,7 @@ def test_unicode_in_reply():
   Quote
 </blockquote>"""
 
-    eq_("<html><head></head><body>Reply&#160;&#160;Text<br><div><br></div>"
+    eq_("<html><head></head><body><p>Reply&#160;&#160;Text<br></p><div><br></div>"
         "</body></html>",
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
@@ -298,7 +298,7 @@ def test_from_block_and_quotations_in_separate_divs():
   </div>
 </div>
 '''
-    eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
+    eq_('<html><head></head><body><p>Reply</p><div><hr></div></body></html>',
         RE_WHITESPACE.sub('', quotations.extract_from_html(msg_body)))
 
 
@@ -375,7 +375,7 @@ def test_CRLF():
     extracted = quotations.extract_from_html(msg_body)
     assert_false(symbol in extracted)
     # Keep new lines otherwise "My reply" becomes one word - "Myreply"
-    eq_("<html><head></head><body>My\nreply\n</body></html>", extracted)
+    eq_("<html><head></head><body><p>My\nreply\n</p></body></html>", extracted)
 
 
 def test_gmail_forwarded_msg():
diff --git a/tests/text_quotations_test.py b/tests/text_quotations_test.py
index d21ea051..d6722369 100644
--- a/tests/text_quotations_test.py
+++ b/tests/text_quotations_test.py
@@ -755,6 +755,6 @@ def test_split_email():
         >
         >
 """
-    expected_markers = "stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
+    expected_markers = b"stttttsttttetesetesmmmmmmssmmmmmmsmmmmmmmm"
     markers = quotations.split_emails(msg)
     eq_(markers, expected_markers)
diff --git a/tests/utils_test.py b/tests/utils_test.py
index 1003f7b5..138338ab 100644
--- a/tests/utils_test.py
+++ b/tests/utils_test.py
@@ -115,7 +115,7 @@ def test_html_to_text():
 def test_comment_no_parent():
     s = "<!-- COMMENT 1 --> no comment"
     d = u.html_document_fromstring(s)
-    eq_("no comment", u.html_tree_to_text(d))
+    eq_(b"no comment", u.html_tree_to_text(d))
 
 
 @patch.object(u.html5parser, 'fromstring', Mock(side_effect=Exception()))
@@ -156,5 +156,5 @@ def test_html_too_big():
 
 @patch.object(u, '_MAX_TAGS_COUNT', 3)
 def test_html_to_text():
-    eq_("Hello", u.html_to_text("<div>Hello</div>"))
+    eq_(b"Hello", u.html_to_text("<div>Hello</div>"))
     eq_(None, u.html_to_text("<div><span>Hi</span></div>"))