import os
from dataclasses import dataclass
from gzip import GzipFile
from typing import Iterable, List

import warcio
from markdownify import markdownify
from tqdm import tqdm
# Alternative HTML-to-text converters:
# import readabilipy
# import html2text

from util import *  # provides download_file


@dataclass(frozen=True)
class Document:
    """A document with a URL and content."""
    url: str
    content: str


def get_common_crawl_urls(snapshot: str = "CC-MAIN-2024-18") -> List[str]:
    """Return the URLs of all the WARC files in the given crawl `snapshot`."""
    download_file(f"https://data.commoncrawl.org/crawl-data/{snapshot}/warc.paths.gz", "var/warc.paths.gz")
    with GzipFile("var/warc.paths.gz") as f:
        urls = ["https://data.commoncrawl.org/" + line.decode("utf-8").rstrip() for line in f]
    return urls


def read_common_crawl(url: str, limit: int) -> Iterable[Document]:
    """Yield at most `limit` documents from the WARC file at `url`."""
    # Download the WARC file and cache it under var/.
    path = os.path.join("var", os.path.basename(url))
    download_file(url, path)
    num_documents = 0
    with open(path, "rb") as f:
        for record in warcio.ArchiveIterator(f):
            if num_documents >= limit:
                break
            if record.rec_type == "response":
                target_uri = record.rec_headers.get_header("WARC-Target-URI")
                content_bytes = record.content_stream().read()
                try:
                    content = content_bytes.decode("utf-8")
                except UnicodeDecodeError:
                    # Skip responses that do not decode as UTF-8 (e.g., binary payloads).
                    continue
                num_documents += 1
                yield Document(target_uri, content)
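
# Note: read_common_crawl is a generator, so the WARC download and parsing are
# deferred until the first document is requested; for example (with a
# hypothetical shard list `urls`): first = next(read_common_crawl(urls[0], limit=1))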


def preprocess(documents: Iterable[Document]) -> Iterable[Document]:
    """Convert each document's HTML content to Markdown."""
    for document in documents:
        markdown = markdownify(document.content)
        yield Document(url=document.url, content=markdown)


def write_documents(documents: Iterable[Document], path: str):
    """Write `documents` to `path`, one page per section."""
    with open(path, "w") as out:
        for i, document in enumerate(documents):
            print(f"--- PAGE {i}: url = {document.url}", file=out)
            print(document.content, file=out)
            print("", file=out)