Skip to content

Commit

Permalink
Feature/Fix: Optimize LinkedIn_to_PDF logic (#194)
Browse files Browse the repository at this point in the history
* Dependencies: Add xhtml2pdf, pathvalidate. Change reportlab version from 4.0.4 to 3.6.13 for xhtml2pdf compatibility.

* Fix: PDF will preserve HTML format. Also structured the file to enable reusable function.
  • Loading branch information
imhalcyon authored Oct 11, 2023
1 parent 9d97fdd commit bf6296f
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 62 deletions.
12 changes: 7 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ floret==0.10.3
fonttools==4.41.0
gitdb==4.0.10
GitPython==3.1.32
htbuilder==0.6.2
idna==3.4
importlib-metadata==6.8.0
jellyfish==1.0.0
Expand All @@ -47,6 +47,7 @@ nltk==3.8.1
numpy==1.25.1
packaging==23.1
pandas==2.0.3
pathvalidate==3.2.0
pathy==0.10.2
Pillow==9.5.0
plotly==5.15.0
Expand Down Expand Up @@ -82,11 +83,11 @@ spacy-legacy==3.0.12
spacy-loggers==1.0.4
srsly==2.4.6
st-annotated-text==4.0.0
streamlit==1.27.0
streamlit-camera-input-live==0.2.0
streamlit-card==0.0.61
streamlit-embedcode==0.1.2
streamlit-extras==0.3.2
streamlit-faker==0.0.2
streamlit-image-coordinates==0.1.5
streamlit-keyup==0.2.0
Expand All @@ -110,7 +111,8 @@ validators==0.20.0
wasabi==1.1.2
watchdog==3.0.0
zipp==3.16.2
reportlab==3.6.13
easygui==0.98.3
cohere~=4.19.2
qdrant-client>=1.2
xhtml2pdf==0.2.11
103 changes: 46 additions & 57 deletions scripts/LinkedinJobToPDF.py
Original file line number Diff line number Diff line change
@@ -1,86 +1,75 @@
"""Convert a LinkedIn job posting into a PDF file.

This script takes a LinkedIn job posting URL and converts the job
description to a PDF file saved in the Data/JobDescription folder.
The file is named <OrgName>__<Job Title>_<X>.pdf, where X is the number
of files already in the folder.

IMPORTANT: Make sure the URL points to the actual job description,
and not the job search page.
"""

import logging
from os import listdir, makedirs
from os.path import isfile, join

import easygui
import requests
from bs4 import BeautifulSoup
from pathvalidate import sanitize_filename
from xhtml2pdf import pisa


def linkedin_to_pdf(job_url: str):
    """Fetch a LinkedIn job posting and save its description as a PDF.

    Parameters
    ----------
    job_url : str
        URL of a LinkedIn job posting page (the posting itself, not a
        search-results page).

    Returns
    -------
    str | None
        Path of the PDF that was written, or None on any failure.
    """
    job_path = "Data/JobDescription/"

    # Ensure the output folder exists before counting the files in it;
    # otherwise listdir() raises on a fresh checkout.
    makedirs(job_path, exist_ok=True)
    files_number = len([f for f in listdir(job_path) if isfile(join(job_path, f))])

    try:
        # A timeout keeps the script from hanging forever on a dead host.
        page = requests.get(job_url, timeout=30)

        if page.status_code != 200:
            logging.error(
                f"Failed to retrieve the job posting at {job_url}. "
                f"Status code: {page.status_code}"
            )
            return None

        # Parse the HTML content of the job posting using BeautifulSoup.
        soup = BeautifulSoup(page.text, 'html.parser')

        # Find the job title element and get its text; bail out cleanly
        # instead of raising AttributeError on an unexpected page layout.
        title_element = soup.find('h1', {'class': 'topcard__title'})
        if not title_element:
            logging.error(f"Could not find a job title at {job_url}")
            return None
        job_title = title_element.text.strip()

        # Find the organization name element (try both selectors).
        organization_element = soup.find('span', {'class': 'topcard__flavor'})
        if not organization_element:
            organization_element = soup.find('a', {'class': 'topcard__org-name-link'})
        if not organization_element:
            logging.error(f"Could not find an organization name at {job_url}")
            return None
        organization = organization_element.text.strip()

        # Concatenate the description element's raw HTML children so that
        # xhtml2pdf preserves the original formatting in the PDF.
        job_description = ""
        job_description_element = soup.find('div', {'class': 'show-more-less-html__markup'})
        if job_description_element:
            job_description = "".join(
                str(element) for element in job_description_element.contents
            )

        # Sanitize organization name and job title so they are safe to use
        # in a file name, then build the output path.
        file_path = f"{job_path}{sanitize_filename(organization + '__' + job_title)}_{files_number}.pdf"

        # Create a PDF file and write the job description to it.
        with open(file_path, 'wb') as pdf_file:
            status = pisa.CreatePDF(job_description, dest=pdf_file, encoding='utf-8')
        if status.err:
            logging.error(f"xhtml2pdf reported {status.err} error(s) writing {file_path}")
            return None

        logging.info("PDF saved to " + file_path)
        return file_path
    except Exception as e:
        # Network failures and unexpected markup both land here; report and
        # return None rather than exit(), so callers keep control.
        logging.error(f"Could not get the description from the URL: {job_url}")
        logging.error(e)
        return None


if __name__ == "__main__":
    # enterbox() returns None when the dialog is cancelled — guard before
    # calling .strip() on it.
    url = easygui.enterbox("Enter the URL of the LinkedIn Job Posting:")
    if url:
        linkedin_to_pdf(url.strip())

0 comments on commit bf6296f

Please sign in to comment.