# scrape.py
# Scrapes job postings from emploisfp-psjobs.cfp-psc.gc.ca and loads them into a Postgres `job` table.
from selenium import webdriver
from requests_html import AsyncHTMLSession
from pyquery import PyQuery
from bs4 import BeautifulSoup
from selenium.webdriver.support.wait import WebDriverWait
import psycopg2
import os
import asyncio
import re
_DOMAIN = 'https://emploisfp-psjobs.cfp-psc.gc.ca'
_LOGIN_URL = f'{_DOMAIN}/psrs-srfp/applicant/page1710'

# Text shown on the intermediate page that links out to an externally hosted posting.
_TRANSFER_NOTICE = (
    "The job opportunity you have selected requires the Public Service "
    "Commission (PSC) to transfer you"
)

# Record keys, in the order of the `job` columns they are inserted into
# (the record's 'text' value is stored in the `contents` column).
_COLUMN_KEYS = ('url', 'title', 'closing', 'department', 'location', 'internal', 'text')


def main() -> None:
    """Scrape the job listings, fetch each posting's text and store the rows in Postgres.

    Credentials and connection settings are read from the environment:
    GCJOBS_USERNAME, GCJOBS_PASSWORD, GOOGLE_CHROME_BIN, CHROMEDRIVER_PATH
    and DATABASE_URL.
    """
    username = os.getenv('GCJOBS_USERNAME')
    password = os.getenv('GCJOBS_PASSWORD')

    listings = _collect_listings(username, password)
    print('selenium finished running')

    # Each record is updated in place with its final 'url' and its 'text'.
    asyncio.run(_fetch_all_postings(listings, username, password))
    print('async finished running')

    rows = _dedupe_rows(listings)
    if rows:
        _write_rows(os.getenv('DATABASE_URL'), rows)
        print('data written to postgres')
    else:
        print('no data found to write to postgres')


def _login(driver, username, password) -> bool:
    """Fill in and submit the login form; return True if no step raised.

    A True result only means the form was submitted without an error, not
    that the site accepted the credentials.
    """
    from selenium.webdriver.common.by import By

    try:
        driver.get(_LOGIN_URL)
        driver.find_element(By.ID, 'UserNumber').send_keys(username)
        driver.find_element(By.ID, 'Password').send_keys(password)
        driver.find_element(By.CSS_SELECTOR, 'input[name=LOGIN]').click()
    except Exception as e:
        # Best effort: without a login only the first tab is crawled.
        print(e)
        return False
    return True


def _parse_total_pages(page_source: str) -> int:
    """Return the page count taken from the first 'of <N>' in the page source.

    Raises:
        ValueError: if the page source contains no such text.
    """
    match = re.search(r'of (\d+)', page_source)
    if match is None:
        raise ValueError('could not find the total page count in the page source')
    return int(match.group(1))


def _listing_record(href, title, details, internal) -> dict:
    """Build one listing record from the pieces of a search-result row.

    `details` holds the row's text fragments: closing date, department,
    then zero or more location fragments.

    Raises:
        ValueError: if `details` has fewer than two items.
    """
    date, dept, *loc = details
    return {
        'url': f'{_DOMAIN}{href}',
        'title': title,
        'closing': date.replace('Closing date: ', ''),
        'department': dept.split(' - ')[0].strip(),
        'location': ','.join(loc),
        'internal': internal,
    }


def _collect_listings(username, password) -> list:
    """Walk every search-result page with headless Chrome and return the listing records.

    Any error during the crawl is printed and ends the crawl early; the
    records gathered up to that point are still returned. The browser is
    always shut down.
    """
    from selenium.webdriver.common.by import By

    option = webdriver.ChromeOptions()
    option.binary_location = os.getenv('GOOGLE_CHROME_BIN')
    option.add_argument("--headless")
    option.add_argument('--disable-gpu')
    option.add_argument('--no-sandbox')
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
    # Service object; kept as-is because the installed version is unknown - confirm.
    driver = webdriver.Chrome(executable_path=os.getenv('CHROMEDRIVER_PATH'), options=option)

    listings = []
    try:
        is_authenticated = _login(driver, username, password)
        tabs = (1, 2) if is_authenticated else (1,)
        for tab in tabs:
            internal = 1 if tab == 1 and is_authenticated else 0
            driver.get(f'{_DOMAIN}/psrs-srfp/applicant/page2440?requestedPage=1&tab={tab}')
            # Wait for the pager before reading the page count from the source.
            WebDriverWait(driver, 30).until(
                lambda d: d.find_element(By.CSS_SELECTOR, 'span.pagelinks')
            )
            total_pages = _parse_total_pages(driver.page_source)
            for n in range(1, total_pages + 1):
                driver.get(f'{_DOMAIN}/psrs-srfp/applicant/page2440?requestedPage={n}&tab={tab}')
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                for row in soup.select('li.searchResult'):
                    link = row.select_one('a')
                    cell = row.select_one('div.tableCell')
                    try:
                        record = _listing_record(
                            link.attrs.get('href'),
                            link.text,
                            list(cell.stripped_strings),
                            internal,
                        )
                    except (AttributeError, ValueError) as e:
                        # A row without a link/cell or with too few fields should
                        # not abort the rest of the crawl.
                        print(f'skipping malformed search result: {e}')
                        continue
                    listings.append(record)
    except Exception as e:
        print(e)
    finally:
        driver.quit()
    return listings


async def _fetch_posting(session, record: dict) -> dict:
    """Fetch one posting and store its final 'url' and searchable 'text' on `record`.

    When the page is the transfer notice, the linked page is fetched instead
    and the record's URL is replaced by that link. On any failure the body
    text is left empty, so 'text' still holds title + department + location.
    """
    url = record.get('url')
    try:
        r = await session.get(url)
        text = r.html.find('body', first=True).text
        if _TRANSFER_NOTICE in text:
            anchor = r.html.find('main.container > p > a', first=True)
            href = anchor.attrs.get('href')
            if not href:
                # Keep the known URL rather than overwriting it with None.
                raise ValueError('transfer notice has no link')
            url = href
            r = await session.get(url)
            pq = PyQuery(r.content)
            pq('script').remove()
            text = pq('body').text()
    except Exception as e:
        print(f'failed to fetch {url}: {e}')
        text = ''
    record['url'] = url
    # possibly redundant adding the title,dept,loc, but ensures it always exists for search
    record['text'] = text + record['title'] + record['department'] + record['location']
    return record


async def _fetch_all_postings(records, username, password) -> list:
    """Log in with an async HTTP session and fetch every posting concurrently."""
    session = AsyncHTMLSession()
    data = {'UserNumber': username, 'Password': password, 'LOGIN': 'Login'}
    await session.post(_LOGIN_URL, data=data)
    return await asyncio.gather(*(_fetch_posting(session, r) for r in records))


def _dedupe_rows(records) -> list:
    """Return the records with duplicate (url, title) pairs removed, keeping the first of each."""
    seen = set()
    unique = []
    for record in records:
        key = (record['url'], record['title'])
        if key not in seen:
            seen.add(key)
            unique.append(record)
    return unique


def _row_values(record: dict) -> tuple:
    """Return the record's values in the column order used by the insert statement."""
    return tuple(record[key] for key in _COLUMN_KEYS)


def _write_rows(database_url, rows) -> None:
    """Recreate the `job` table and insert `rows` in a single transaction."""
    conn = psycopg2.connect(database_url, sslmode='require')
    try:
        # `with conn` commits on success and rolls back on error, but it does
        # not close the connection; that is done in the `finally` below.
        with conn:
            with conn.cursor() as c:
                c.execute('drop table if exists job;')
                c.execute('CREATE TABLE if not exists job (id serial primary key, url text, title text, closing text, department text, location text, internal smallint, contents text);')
                c.executemany(
                    'insert into job(url, title, closing, department, location, internal, contents) values (%s,%s,%s,%s,%s,%s,%s)',
                    [_row_values(d) for d in rows],
                )
    finally:
        conn.close()