-
Notifications
You must be signed in to change notification settings - Fork 14
/
update_dataset.py
40 lines (31 loc) · 1.06 KB
/
update_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
import shutil
from glob import glob

import git
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from tqdm import tqdm
# Source dataset repository on the Hugging Face Hub and the local directory
# it is cloned into.
REPO_URL = "https://huggingface.co/datasets/taesiri/arxiv_qa2"
REPO_PATH = "./arxiv_qa2"

# Always start from a fresh clone: remove any checkout left by a previous run.
if os.path.exists(REPO_PATH):
    print("Removing existing repo")
    # shutil.rmtree instead of shelling out to `rm -rf`: no shell command is
    # built from the path, it works on non-POSIX systems, and a failed
    # removal raises instead of being silently ignored.
    shutil.rmtree(REPO_PATH)

git.Repo.clone_from(REPO_URL, REPO_PATH)
# Load every per-paper CSV found under <REPO_PATH>/papers/<subdir>/ and tag
# each row with the paper id, taken from the CSV file name.
dfs = []
for csv_path in tqdm(glob(os.path.join(REPO_PATH, "papers", "*", "*.csv"))):
    try:
        paper_df = pd.read_csv(csv_path)
    except Exception as exc:
        # Best effort: one unreadable/malformed CSV must not abort the whole
        # run. Report which file was skipped and why, then carry on.
        # (`Exception`, not a bare `except`, so Ctrl-C still stops the script.)
        print(f"Skipping {csv_path}: {exc!r}")
        continue

    # File name without the trailing ".csv"; os.path handles either path
    # separator and only strips the final extension.
    paper_id = os.path.splitext(os.path.basename(csv_path))[0]
    paper_df["paper_id"] = paper_id
    dfs.append(paper_df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that says what actually went wrong.
if not dfs:
    raise SystemExit(f"No readable CSV files found under {REPO_PATH}/papers")

df = pd.concat(dfs, ignore_index=True)
# Drop rows whose answer starts with "Unfortunately" (ignoring surrounding
# whitespace). na=False makes rows with a missing answer evaluate to False
# instead of NaN; a NaN in the mask would make the `~` negation raise.
is_unfortunately_answer = (
    df["answer"].str.strip().str.startswith("Unfortunately", na=False)
)
df = df[~is_unfortunately_answer]

# Keep a single row per distinct (question, answer) pair.
df = df.drop_duplicates(subset=["question", "answer"])

# Cast every column to str before building the Dataset.
# NOTE(review): presumably to give the pushed dataset a uniform string
# schema; missing values become the literal string "nan" -- confirm intended.
df = df.astype(str)

print(f"Total number of QA pairs: {len(df)}")
# Total number of unique papers:
print(f"Total number of unique papers: {df['paper_id'].nunique()}")

# Build the Hugging Face dataset (without the pandas index) and upload it.
arxiv_qa_dataset = Dataset.from_pandas(df, preserve_index=False)
arxiv_qa_dataset.push_to_hub("taesiri/arxiv_qa")