Skip to content

Commit

Permalink
Add IPFS upload option
Browse files Browse the repository at this point in the history
  • Loading branch information
duckduckgrayduck committed Dec 20, 2024
1 parent 3d6f8dd commit b486cc0
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 3 deletions.
6 changes: 6 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ properties:
type: string
format: uri
description: Enter a slack webhook to enable Slack notifications
filecoin:
title: Push to IPFS/Filecoin
type: boolean
description: >-
WARNING: This will push all scraped files to IPFS and Filecoin.
There is no way to remove files from these storage systems.
required:
- feed
- feed_name
Expand Down
22 changes: 19 additions & 3 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
""" Fetches documents from RSS feeds,
uploads them to DocumentCloud,
and optionally pushes them to IPFS/Filecoin
"""
import urllib.parse as urlparse

from ratelimit import limits, sleep_and_retry

import feedparser
from documentcloud.addon import AddOn
from documentcloud.constants import BULK_LIMIT
from documentcloud.toolbox import grouper, requests_retry_session
from ratelimit import limits, sleep_and_retry

DOC_CUTOFF = 10
MAX_NEW_DOCS = 10

FILECOIN_ID = 104

class Document:
"""Class to hold information about individual documents"""
Expand All @@ -29,6 +34,7 @@ def fixed_url(self):


class Fetcher(AddOn):
""" Add-On that fetches documents from RSS feeds """
@sleep_and_retry
@limits(calls=5, period=1)
def fetch(self, feed, depth=0):
Expand Down Expand Up @@ -56,9 +62,10 @@ def fetch(self, feed, depth=0):
return docs

def upload(self, docs):
""" Uploads documents to DocumentCloud in batches """
if self.data.get("dry_run"):
return

doc_ids = []
for doc_group in grouper(docs, BULK_LIMIT):
# filter out None's from grouper padding
doc_group = [d for d in doc_group if d]
Expand All @@ -77,6 +84,13 @@ def upload(self, docs):
]
resp = self.client.post("documents/", json=doc_group)
resp.raise_for_status()
doc_ids.extend([d.id for d in doc_group])

if self.data.get("filecoin") and doc_ids:
self.client.post(
"addon_runs/",
json={"addon": FILECOIN_ID, "parameters": {}, "documents": doc_ids},
)

def send_notification(self, subject, message):
"""Send notifications via slack and email"""
Expand All @@ -103,13 +117,15 @@ def send_scrape_message(self, new_docs):
self.send_notification(subj, body)

def set_project(self, user_input):
    """Set ``self.project`` from a project ID or a project title.

    ``user_input`` is first parsed with ``int()``; when that succeeds the
    resulting integer is stored as the project ID.  When parsing raises
    ``ValueError`` the input is treated as a title and handed to
    ``self.client.projects.get_or_create_by_title``; the ``id`` of the
    project it returns is stored instead.
    """
    try:
        self.project = int(user_input)
    except ValueError:
        # Not numeric, so treat it as a title.  The call returns a pair;
        # only the project object itself is needed here.
        project, _ = self.client.projects.get_or_create_by_title(user_input)
        self.project = project.id

def main(self):
""" Fetches the new docs and uploads them """
self.set_project(self.data["project"])
new_docs = self.fetch(self.data["feed"])
if new_docs:
Expand Down

0 comments on commit b486cc0

Please sign in to comment.