# coding=utf-8
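"""Crawl a multi-page online tutorial and bundle every page into one PDF.

Crawler is the reusable base class; LiaoxuefengPythonCrawler is a concrete
implementation for liaoxuefeng.com. Requires requests, beautifulsoup4 and
pdfkit (which in turn needs the wkhtmltopdf binary on the PATH).
"""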
from __future__ import unicode_literals

import logging
import os
import re
import time

try:
    from urllib.parse import urlparse  # py3
except ImportError:
    from urlparse import urlparse  # py2

import pdfkit
import requests
from bs4 import BeautifulSoup
html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""


class Crawler(object):
    """Base class for all crawlers; every concrete crawler should inherit it."""

    name = None

    def __init__(self, name, start_url):
        """
        :param name: file name the resulting PDF will be saved under
        :param start_url: entry URL for the crawl
        """
        self.name = name
        self.start_url = start_url
        self.domain = '{uri.scheme}://{uri.netloc}'.format(uri=urlparse(self.start_url))

    @staticmethod
    def request(url, **kwargs):
        """
        Issue an HTTP GET request and return the response object; extra
        keyword arguments (headers, timeout, ...) pass straight through
        to requests.get.
        """
        response = requests.get(url, **kwargs)
        return response

    def parse_menu(self, response):
        """
        Extract every chapter URL from the table of contents.
        Implemented by subclasses.

        :param response: response object for the start URL
        :return: iterable of chapter URLs
        """
        raise NotImplementedError

    def parse_body(self, response):
        """
        Parse the article body. Implemented by subclasses.

        :param response: response object returned by the crawler
        :return: processed HTML body text
        """
        raise NotImplementedError

    def run(self):
        start = time.time()
        options = {
            'page-size': 'Letter',
            'margin-top': '0.75in',
            'margin-right': '0.75in',
            'margin-bottom': '0.75in',
            'margin-left': '0.75in',
            'encoding': "UTF-8",
            'custom-header': [
                ('Accept-Encoding', 'gzip')
            ],
            'cookie': [
                ('cookie-name1', 'cookie-value1'),
                ('cookie-name2', 'cookie-value2'),
            ],
            'outline-depth': 10,
        }
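        # pdfkit forwards each key above as the matching wkhtmltopdf
        # command-line flag (e.g. 'page-size' becomes --page-size); the
        # cookie entries are placeholders, only needed for sites behind a
        # login, and 'outline-depth' controls PDF bookmark nesting.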

        htmls = []
        for index, url in enumerate(self.parse_menu(self.request(self.start_url))):
            html = self.parse_body(self.request(url))
            f_name = ".".join([str(index), "html"])
            with open(f_name, 'wb') as f:
                f.write(html)
            htmls.append(f_name)

        pdfkit.from_file(htmls, self.name + ".pdf", options=options)
        for html in htmls:
            os.remove(html)

        total_time = time.time() - start
        print("Total time: %f seconds" % total_time)


class LiaoxuefengPythonCrawler(Crawler):
    """Crawler for Liao Xuefeng's Python 3 tutorial."""

    def parse_menu(self, response):
        """
        Parse the table of contents and collect every chapter URL.

        :param response: response object for the start URL
        :return: generator of absolute chapter URLs
        """
        soup = BeautifulSoup(response.content, "html.parser")
        menu_tag = soup.find_all(class_="uk-nav uk-nav-side")[1]
        for li in menu_tag.find_all("li"):
            url = li.a.get("href")
            if not url.startswith("http"):
                url = "".join([self.domain, url])  # expand to an absolute URL
            yield url

    def parse_body(self, response):
        """
        Parse the article body of one chapter page.

        :param response: response object returned by the crawler
        :return: processed HTML text as UTF-8 bytes
        """
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find_all(class_="x-wiki-content")[0]

            # Insert the chapter title at the top, centered.
            title = soup.find('h4').get_text()
            center_tag = soup.new_tag("center")
            title_tag = soup.new_tag('h1')
            title_tag.string = title
            center_tag.insert(1, title_tag)
            body.insert(1, center_tag)
            html = str(body)

            # Rewrite relative img src paths in the body to absolute URLs.
            pattern = r'(<img .*?src=")(.*?)(")'

            def func(m):
                if not m.group(2).startswith("http"):
                    return "".join([m.group(1), self.domain, m.group(2), m.group(3)])
                return "".join([m.group(1), m.group(2), m.group(3)])

            html = re.compile(pattern).sub(func, html)
            html = html_template.format(content=html)
            return html.encode("utf-8")
        except Exception:
            logging.error("Parse error", exc_info=True)
            raise  # fail loudly rather than hand None back to run()
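

# A minimal sketch of what supporting another site could look like. The class
# name and CSS selectors below are hypothetical placeholders, not taken from
# any real site; a real subclass has to inspect the target site's markup and
# substitute its own selectors.
class ExampleDocsCrawler(Crawler):
    def parse_menu(self, response):
        soup = BeautifulSoup(response.content, "html.parser")
        for a in soup.select("nav.toc a"):  # hypothetical TOC selector
            url = a.get("href")
            if not url.startswith("http"):
                url = "".join([self.domain, url])  # expand to an absolute URL
            yield url

    def parse_body(self, response):
        soup = BeautifulSoup(response.content, "html.parser")
        body = soup.select_one("div.article")  # hypothetical body selector
        html = html_template.format(content=str(body))
        return html.encode("utf-8")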


if __name__ == '__main__':
    start_url = "http://www.liaoxuefeng.com/wiki/0013739516305929606dd18361248578c67b8067c8c017b000"
    crawler = LiaoxuefengPythonCrawler("廖雪峰Git", start_url)
    crawler.run()