-
Notifications
You must be signed in to change notification settings - Fork 0
/
sh.py
165 lines (148 loc) · 8.01 KB
/
sh.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env python3
# author: aaron792
import re
import pandas as pd
import requests
import grequests
import datetime
import csv
import json
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")
class SH(object):
    """Client for the Shanghai Stock Exchange (SSE) public query API.

    Retrieves the roster of SSE-listed companies and each company's
    periodic-report (quarterly / semi-annual / annual) disclosure records,
    and downloads the report PDF files.
    """

    # The SSE query endpoints reject requests that lack a browser-like
    # User-Agent and a matching Referer, so every request sends these.
    _USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/80.0.3987.163 Safari/537.36')

    def __init__(self, cookies):
        """
        :param cookies: your SSE session cookies, sent with every request
        """
        self.cookies = cookies

    @staticmethod
    def _jsonp_payload(text):
        """Strip the JSONP wrapper ``jsonpCallbackNNN(...)`` from *text*
        and return the parsed JSON payload."""
        # The response looks like 'jsonpCallback12345({...})'; split off the
        # leading callback name and drop the trailing ')'.
        return json.loads(re.split(r'Callback\d+\(', text)[-1][:-1])

    def date_ranges(self):
        """Split [1990-11-26, today] into consecutive ~900-day windows.

        The SSE bulletin query limits how wide a date span one request may
        cover, so long histories are fetched window by window.

        :return: list of ('YYYY-MM-DD', 'YYYY-MM-DD') begin/end string
                 pairs; consecutive pairs share their boundary date.
        """
        begin = datetime.datetime(1990, 11, 26)
        now = datetime.datetime.today()
        step = datetime.timedelta(days=900)
        # Bug fix: the original never appended the begin date itself, so the
        # first window (1990-11-26 .. first step) was silently dropped.
        dates = [begin.strftime('%Y-%m-%d')]
        date = begin
        while date + step < now:  # logical test, not the original bitwise `&`
            date += step
            dates.append(date.strftime('%Y-%m-%d'))
        dates.append(now.strftime('%Y-%m-%d'))
        return [(d1, d2) for d1, d2 in zip(dates, dates[1:])]

    def companys(self):
        """Roster of all SSE-listed companies: name and stock code.

        :return: DataFrame with columns ['name', 'code']
        """
        headers = {'User-Agent': self._USER_AGENT,
                   'Referer': 'http://www.sse.com.cn/assortment/stock/list/share/'}
        base = ('http://query.sse.com.cn/security/stock/getStockListData2.do?'
                '&jsonCallBack=jsonpCallback13284&isPagination=true&stockCode='
                '&csrcCode=&areaName=&stockType=1&pageHelp.cacheSize=1'
                '&pageHelp.beginPage={page}&pageHelp.pageSize=25'
                '&pageHelp.pageNo=2&pageHelp.endPage=21')
        # First request only to learn the total page count.
        resp = requests.get(base.format(page=1), headers=headers,
                            cookies=self.cookies)
        pages = self._jsonp_payload(resp.text)['pageHelp']['pageCount']
        stocks = []
        for page in range(1, pages + 1):
            page_resp = requests.get(base.format(page=page), headers=headers,
                                     cookies=self.cookies)
            results = self._jsonp_payload(page_resp.text)['result']
            stocks.extend([r['COMPANY_ABBR'], r['COMPANY_CODE']]
                          for r in results)
        return pd.DataFrame(stocks, columns=['name', 'code'])

    def disclosure(self, code):
        """Periodic-report disclosure records for one company: stock code,
        report type, year, disclosure date and PDF download link.

        :param code: stock code
        :return: DataFrame with columns
                 ['company', 'code', 'type', 'year', 'date', 'pdf']
        """
        print('正在获取{}定期报告披露信息'.format(code))
        headers = {'User-Agent': self._USER_AGENT,
                   'Referer': 'http://www.sse.com.cn/disclosure/listedinfo/regular/'}
        # The original assigned `base` twice; only the jsonCallBack variant
        # that was actually used is kept.
        base = ('http://query.sse.com.cn/security/stock/queryCompanyBulletin.do?'
                'jsonCallBack=jsonpCallback92897&isPagination=true'
                '&productId={code}'
                '&securityType=0101%2C120100%2C020100%2C020200%2C120200'
                '&reportType2=DQBG&reportType=&beginDate={beginDate}'
                '&endDate={endDate}&pageHelp.pageSize=25&pageHelp.pageCount=50'
                '&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1'
                '&pageHelp.endPage=5&_=1632748934222')
        datas = []
        for begin, end in self.date_ranges():
            url = base.format(code=code, beginDate=begin, endDate=end)
            resp = requests.get(url, headers=headers, cookies=self.cookies)
            results = self._jsonp_payload(resp.text)['result']
            for result in results:
                pdf = 'http://www.sse.com.cn' + result['URL']
                # Keep only links whose path contains a 6-digit code, as in
                # the original (a stricter filename filter existed but was
                # commented out).
                if re.search(r'\d{6}', pdf):
                    datas.append([result['SECURITY_CODE'], code,
                                  result['BULLETIN_TYPE'],
                                  result['BULLETIN_YEAR'],
                                  result['SSEDATE'], pdf])
        return pd.DataFrame(
            datas, columns=['company', 'code', 'type', 'year', 'date', 'pdf'])

    def pdfurls(self, code):
        """Collect only the periodic-report PDF download links.

        :param code: stock code
        :return: list of report PDF URLs
        """
        print('准备获取{}年报文件链接'.format(code))
        headers = {'User-Agent': self._USER_AGENT,
                   'Referer': 'http://www.sse.com.cn/disclosure/listedinfo/regular/'}
        # No jsonCallBack parameter here, so the endpoint answers with plain
        # JSON (no JSONP wrapper to strip).
        base = ('http://query.sse.com.cn/security/stock/queryCompanyBulletin.do?'
                'isPagination=true&productId={code}'
                '&securityType=0101%2C120100%2C020100%2C020200%2C120200'
                '&reportType2=DQBG&reportType=&beginDate={beginDate}'
                '&endDate={endDate}&pageHelp.pageSize=25&pageHelp.pageCount=50'
                '&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1'
                '&pageHelp.endPage=5')
        # Report filenames look like 'NNNNNN_YYYY_t.pdf' with t in [13nz] —
        # presumably period tags; verify against actual links. The dot is now
        # escaped (the original '.' matched any character).
        pattern = re.compile(r'\d{6}_\d{4}_[13nz]\.pdf')
        urls = []
        for begin, end in self.date_ranges():
            url = base.format(code=code, beginDate=begin, endDate=end)
            resp = requests.get(url, headers=headers, cookies=self.cookies)
            results = json.loads(resp.text)['result']
            for result in results:
                link = 'http://www.sse.com.cn' + result['URL']
                if pattern.search(link):
                    urls.append(link)
        return urls

    def download(self, code, savepath):
        """Download every periodic-report PDF for one company.

        :param code: stock code
        :param savepath: root folder for the data (relative path
                         recommended); files are written under
                         ``<savepath>/disclosure/reports/<code>/``
        :return: None
        """
        path = Path(savepath).joinpath('disclosure', 'reports', str(code))
        path.mkdir(parents=True, exist_ok=True)
        headers = {'User-Agent': self._USER_AGENT,
                   'Referer': 'http://www.sse.com.cn/disclosure/listedinfo/regular/'}
        urls = self.pdfurls(code=code)
        tasks = [grequests.request("GET", url=url, headers=headers,
                                   cookies=self.cookies) for url in urls]
        # The original indexed urls[0]/results[0] for debugging, crashing on
        # empty result sets; that is gone. grequests.map yields None for
        # requests that failed, so skip those explicitly.
        for result in grequests.map(tasks):
            if result is None:
                continue
            pdfname = result.url.split('/')[-1]
            try:
                with open(path.joinpath(pdfname), 'wb') as f:
                    f.write(result.content)
            except OSError as e:
                # Best-effort like the original, but no longer silent.
                print(' 下载{}失败: {}'.format(pdfname, e))
            else:
                print(' 已成功下载{}'.format(pdfname))