crawl_data.py
import os
from dataclasses import asdict
from concurrent.futures import ThreadPoolExecutor
from functools import partial

from config import CrawlerConfig
from utils import get_args
from crawl import AutoCrawler
from database import USTCHBase

# Parse command-line arguments, seeded with the CrawlerConfig defaults.
args = get_args(asdict(CrawlerConfig()))
crawlers = args.crawlers
print(crawlers)

# Optionally connect to a local HBase instance for storing crawled data.
if args.use_hbase:
    hbase = USTCHBase(host='localhost')
else:
    hbase = None
args.hbase = hbase

if args.multi_threads:
    # Run each crawler in its own worker thread; the executor waits for
    # all submitted jobs to finish when the with-block exits.
    with ThreadPoolExecutor(len(crawlers)) as t:
        for name in crawlers:
            crawler = AutoCrawler.from_name(name)(args)
            t.submit(
                partial(
                    crawler.crawl_src,
                    save_path=os.path.join(args.save_dir, name),
                    host_url=crawler.main_url,
                ),
            )
else:
    # Run the crawlers one after another.
    for name in crawlers:
        print(name)
        crawler = AutoCrawler.from_name(name)(args)
        crawler.crawl_src(save_path=os.path.join(args.save_dir, name), host_url=crawler.main_url)
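The repository's utils.get_args is not shown on this page, so the sketch below is only an illustration of one plausible implementation: it assumes get_args turns each field of the CrawlerConfig dataclass (crawlers, use_hbase, multi_threads, save_dir, ...) into a same-named argparse option, which would explain why the script reads args.crawlers, args.use_hbase, args.multi_threads and args.save_dir directly. The exact flag names and types are assumptions, not the project's confirmed API.

import argparse

def get_args(defaults):
    # Hypothetical sketch: build a parser whose options mirror the
    # CrawlerConfig fields passed in as a dict of defaults.
    parser = argparse.ArgumentParser(description='Crawl configured data sources.')
    for key, value in defaults.items():
        if isinstance(value, bool):
            # Booleans become on/off flags, e.g. --use_hbase, --multi_threads (assumed names).
            parser.add_argument(f'--{key}', action='store_true', default=value)
        elif isinstance(value, (list, tuple)):
            # List fields such as the crawler names accept multiple values.
            parser.add_argument(f'--{key}', nargs='+', default=list(value))
        else:
            parser.add_argument(f'--{key}', type=type(value), default=value)
    return parser.parse_args()

Under those assumptions the script would be invoked along the lines of `python crawl_data.py --crawlers news lectures --multi_threads --save_dir data/` (crawler names here are placeholders); the real option names depend on the actual CrawlerConfig fields and get_args implementation.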