Skip to content

Commit

Permalink
Feature/Fix: Optimize LinkedIn_to_PDF logic (#194)
Browse files Browse the repository at this point in the history
* Dependencies: Add xhtml2pdf, pathvalidate. Change reportlab version from 4.0.4 to 3.6.13 for xhtml2pdf compatibility.

* Fix: PDF will preserve HTML format. Also structured the file to enable reusable function.
  • Loading branch information
imhalcyon authored Oct 11, 2023
1 parent 9d97fdd commit bf6296f
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 62 deletions.
12 changes: 7 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ floret==0.10.3
fonttools==4.41.0
gitdb==4.0.10
GitPython==3.1.32
htbuilder==0.6.2
idna==3.4
importlib-metadata==6.8.0
jellyfish==1.0.0
Expand All @@ -47,6 +47,7 @@ nltk==3.8.1
numpy==1.25.1
packaging==23.1
pandas==2.0.3
pathvalidate==3.2.0
pathy==0.10.2
Pillow==9.5.0
plotly==5.15.0
Expand Down Expand Up @@ -82,11 +83,11 @@ spacy-legacy==3.0.12
spacy-loggers==1.0.4
srsly==2.4.6
st-annotated-text==4.0.0
streamlit==1.27.0
streamlit-camera-input-live==0.2.0
streamlit-card==0.0.61
streamlit-embedcode==0.1.2
streamlit-extras==0.3.2
streamlit-faker==0.0.2
streamlit-image-coordinates==0.1.5
streamlit-keyup==0.2.0
Expand All @@ -110,7 +111,8 @@ validators==0.20.0
wasabi==1.1.2
watchdog==3.0.0
zipp==3.16.2
reportlab==3.6.13
easygui==0.98.3
cohere~=4.19.2
qdrant-client>=1.2
xhtml2pdf==0.2.11
103 changes: 46 additions & 57 deletions scripts/LinkedinJobToPDF.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,75 @@
"""Convert a LinkedIn job posting into a PDF file.

This script takes a LinkedIn job posting URL and converts the job
description to a PDF file saved in the Data/JobDescription folder.
The file is named <OrgName>__<Job Title>_<X>.pdf, where X is the number
of files already in the folder.

IMPORTANT: Make sure the URL points to the actual job description,
and not the job search page.
"""

import logging
from os import listdir, makedirs
from os.path import isfile, join

import easygui
import requests
from bs4 import BeautifulSoup
from pathvalidate import sanitize_filename
from xhtml2pdf import pisa


def linkedin_to_pdf(job_url: str):
    """Fetch a LinkedIn job posting and save its description as a PDF.

    Parameters
    ----------
    job_url : str
        URL of a LinkedIn job posting page (the posting itself, not a
        search-results page).

    Returns
    -------
    str | None
        Path of the PDF that was written, or None on any failure.
    """
    job_path = "Data/JobDescription/"

    # Ensure the output folder exists before counting the files in it;
    # otherwise listdir() raises on a fresh checkout.
    makedirs(job_path, exist_ok=True)
    files_number = len([f for f in listdir(job_path) if isfile(join(job_path, f))])

    try:
        # A timeout keeps the script from hanging forever on a dead host.
        page = requests.get(job_url, timeout=30)

        if page.status_code != 200:
            logging.error(
                f"Failed to retrieve the job posting at {job_url}. "
                f"Status code: {page.status_code}"
            )
            return None

        # Parse the HTML content of the job posting using BeautifulSoup.
        soup = BeautifulSoup(page.text, 'html.parser')

        # Find the job title element and get its text; bail out cleanly
        # instead of raising AttributeError on an unexpected page layout.
        title_element = soup.find('h1', {'class': 'topcard__title'})
        if not title_element:
            logging.error(f"Could not find a job title at {job_url}")
            return None
        job_title = title_element.text.strip()

        # Find the organization name element (try both selectors).
        organization_element = soup.find('span', {'class': 'topcard__flavor'})
        if not organization_element:
            organization_element = soup.find('a', {'class': 'topcard__org-name-link'})
        if not organization_element:
            logging.error(f"Could not find an organization name at {job_url}")
            return None
        organization = organization_element.text.strip()

        # Concatenate the description element's raw HTML children so that
        # xhtml2pdf preserves the original formatting in the PDF.
        job_description = ""
        job_description_element = soup.find('div', {'class': 'show-more-less-html__markup'})
        if job_description_element:
            job_description = "".join(
                str(element) for element in job_description_element.contents
            )

        # Sanitize organization name and job title so they are safe to use
        # in a file name, then build the output path.
        file_path = f"{job_path}{sanitize_filename(organization + '__' + job_title)}_{files_number}.pdf"

        # Create a PDF file and write the job description to it.
        with open(file_path, 'wb') as pdf_file:
            status = pisa.CreatePDF(job_description, dest=pdf_file, encoding='utf-8')
        if status.err:
            logging.error(f"xhtml2pdf reported {status.err} error(s) writing {file_path}")
            return None

        logging.info("PDF saved to " + file_path)
        return file_path
    except Exception as e:
        # Network failures and unexpected markup both land here; report and
        # return None rather than exit(), so callers keep control.
        logging.error(f"Could not get the description from the URL: {job_url}")
        logging.error(e)
        return None


if __name__ == "__main__":
    # enterbox() returns None when the dialog is cancelled — guard before
    # calling .strip() on it.
    url = easygui.enterbox("Enter the URL of the LinkedIn Job Posting:")
    if url:
        linkedin_to_pdf(url.strip())

0 comments on commit bf6296f

Please sign in to comment.