Use common outgoing connection Session creation code for scraping #817

Merged · 3 commits · Oct 11, 2024
8 changes: 0 additions & 8 deletions mcweb/backend/search/views.py
@@ -11,7 +11,6 @@
from django.views.decorators.http import require_http_methods
from mc_providers.exceptions import UnsupportedOperationException, QueryingEverythingUnsupportedQuery
from mc_providers.exceptions import ProviderException
from requests.adapters import HTTPAdapter
from rest_framework.authentication import SessionAuthentication, TokenAuthentication
from rest_framework.decorators import api_view, action, authentication_classes, permission_classes
from rest_framework.permissions import IsAuthenticated
@@ -51,13 +50,6 @@
# enable caching for mc_providers results (explicitly referencing pkg for clarity)
mc_providers.cache.CachingManager.cache_function = mc_providers_cacher

# not used in this file?
#session = requests.Session()
#retry = Retry(connect=3, backoff_factor=0.5)
#adapter = HTTPAdapter(max_retries=retry)
#session.mount('http://', adapter)
#session.mount('https://', adapter)

def error_response(msg: str, response_type: HttpResponse | None) -> HttpResponse:
ResponseClass = response_type or HttpResponseBadRequest
return ResponseClass(json.dumps(dict(
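
The comment block deleted above is exactly the pattern this PR centralizes: a requests.Session with a retry-mounted HTTPAdapter. As a minimal sketch, assuming the shared insecure_requests_session() helper in mcmetadata.requests_arcana wires up something similar (its internals are not shown in this diff, and make_retrying_session is a hypothetical name):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(user_agent: str) -> requests.Session:
    # hypothetical stand-in for mcmetadata's shared session helper
    session = requests.Session()
    session.headers["User-Agent"] = user_agent
    # retry failed connection attempts, with exponential backoff
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)   # all http:// requests go through the adapter
    session.mount("https://", adapter)  # likewise https://
    return session

With one helper owning session construction, views.py no longer needs its own commented-out, unused copy, and the scraping code below picks up the same retry behavior.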
66 changes: 50 additions & 16 deletions mcweb/backend/sources/models.py
@@ -1,19 +1,53 @@
import logging
from typing import Dict

# PyPI:
import feed_seeker
from mcmetadata.feeds import normalize_url
import mcmetadata.urls as urls
import requests
from django.db import models
from django.db.utils import IntegrityError
import requests
from mcmetadata.feeds import normalize_url
from mcmetadata.requests_arcana import insecure_requests_session
from mcmetadata.webpages import MEDIA_CLOUD_USER_AGENT

# not from PyPI: package installed via github URL
from mc_sitemap_tools.discover import find_gnews_fast
from settings import SCRAPE_TIMEOUT_SECONDS
from mc_sitemap_tools.discover import NewsDiscoverer

# mcweb
from settings import SCRAPE_TIMEOUT_SECONDS # time to scrape an entire source

logger = logging.getLogger(__name__)

# time for individual HTTP connect/read
SCRAPE_HTTP_SECONDS = SCRAPE_TIMEOUT_SECONDS / 5

def rss_page_fetcher(url: str) -> str:
"""
custom fetcher for RSS pages for feed_seeker
(adapted from feed_seeker's default_fetch_function)
"""
logger.debug("rss_page_fetcher %s", url)
session = insecure_requests_session(MEDIA_CLOUD_USER_AGENT)

try:
# provide connection and read timeouts in case alarm based timeout fails
# (scrapes sometimes hang).
response = session.get(url,
timeout=(SCRAPE_HTTP_SECONDS, SCRAPE_HTTP_SECONDS))
if response.ok:
return response.text
else:
return '' # non-fatal error
except (requests.ConnectTimeout, # connect timeout
requests.ConnectionError, # DNS failure, connection refused
requests.ReadTimeout, # read timeout
requests.TooManyRedirects, # redirect loop
requests.exceptions.InvalidSchema, # email addresses
requests.exceptions.RetryError):
# signal page failure, but not bad enough to abandon site:
return ''
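
A note on the timeouts above: requests reads a two-element timeout as (connect timeout, read timeout), and the read timeout bounds each blocking socket read rather than the whole response, which is why feed_seeker's alarm-based max_time remains the overall backstop. Setting SCRAPE_HTTP_SECONDS to a fifth of the whole-scrape budget leaves room for several page fetches per scrape. A small illustration, not part of the diff (the URL and timeout value are placeholders):

import requests

SCRAPE_TIMEOUT_SECONDS = 300                      # placeholder; mcweb takes this from settings
SCRAPE_HTTP_SECONDS = SCRAPE_TIMEOUT_SECONDS / 5  # per-request connect/read budget

try:
    # 2-tuple timeout: first element limits the TCP connect,
    # second limits each blocking read on the socket
    response = requests.get("https://example.com/feeds",
                            timeout=(SCRAPE_HTTP_SECONDS, SCRAPE_HTTP_SECONDS))
except requests.Timeout:
    pass  # covers ConnectTimeout and ReadTimeout; rss_page_fetcher returns '' instead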

class Collection(models.Model):

class CollectionPlatforms(models.TextChoices):
@@ -211,16 +245,15 @@ def _scrape_source(source_id: int, homepage: str, name: str, verbosity: int = 1)
def add_line(line):
logger.debug("add_line: %s", line.rstrip()) # without newlines!
if not line.endswith("\n"):
logger.warning("missing newline on %s", line)
line += "\n"
lines.append(line)

# per-source header line
add_line(f"Scraped source {source_id} ({name}), {homepage}\n")
add_line(f"Scraped source {source_id} ({name}), {homepage}")

if not homepage:
add_line("MISSING HOMEPAGE\n")
return "".join(lines)
add_line("MISSING HOMEPAGE")
return "".join(lines) # error not indented

total = added = confirmed = 0
def process_urls(from_: str, urls: list[str]):
@@ -230,32 +263,33 @@ def process_urls(from_: str, urls: list[str]):
nurl = normalize_url(url)
if nurl in old_urls:
if verbosity >= 1:
add_line(f"found existing {from_} feed {url}\n")
add_line(f"found existing {from_} feed {url}")
logger.info(f"scrape_source({source_id}, {homepage}) found existing {from_} feed {url}")
confirmed += 1
else:
try:
feed = Feed(source_id=source_id, admin_rss_enabled=True, url=url)
feed.save()
add_line(f"added new {from_} feed {url}\n")
add_line(f"added new {from_} feed {url}")
logger.info(f"scrape_source({source_id}, {homepage}) added new {from_} feed {url}")
old_urls[nurl] = url # try to prevent trying to add twice
added += 1
except IntegrityError:
# happens when feed exists, but under a different source!
# could do lookup by URL, and report what source (name & id) it's under....
add_line(f"{from_} feed {url} exists under some other source!!!\n")
add_line(f"{from_} feed {url} exists under some other source!!!")
logger.warning(f"scrape_source({source_id}, {homepage}) duplicate {from_} feed {url} (exists under another source?)")

# end process_feeds

# Look for RSS feeds
try:
new_feed_generator = feed_seeker.generate_feed_urls(homepage, max_time=SCRAPE_TIMEOUT_SECONDS)
new_feed_generator = feed_seeker.generate_feed_urls(
homepage, max_time=SCRAPE_TIMEOUT_SECONDS, fetcher=rss_page_fetcher)
# create list so DB operations in process_urls are not under the timeout gun.
process_urls("rss", list(new_feed_generator))
except requests.RequestException as e: # maybe just catch Exception?
add_line(f"fatal error for rss: {e!r}\n")
add_line(f"fatal error for rss: {e!r}")
logger.warning("generate_feed_urls(%s): %r", homepage, e)
except TimeoutError:
add_line(f"timeout for rss")
@@ -266,17 +300,17 @@ def process_urls(from_: str, urls: list[str]):
sitemaps = "news sitemap" # say something once, why say it again?

try:
gnews_urls = find_gnews_fast(homepage, timeout=SCRAPE_TIMEOUT_SECONDS)
nd = NewsDiscoverer(MEDIA_CLOUD_USER_AGENT)
gnews_urls = nd.find_gnews_fast(homepage, timeout=SCRAPE_HTTP_SECONDS)
except requests.RequestException as e:
add_line(f"fatal error for {sitemaps}: {e!r}")
logger.exception("find_gnews_fast")

if gnews_urls:
process_urls(sitemaps, gnews_urls)


# after many tries to give a summary in English:
add_line(f"{added}/{total} added, {confirmed}/{old} confirmed\n")
add_line(f"{added}/{total} added, {confirmed}/{old} confirmed")

indent = " " # not applied to header line
return indent.join(lines)
8 changes: 4 additions & 4 deletions requirements.txt
@@ -6,7 +6,7 @@ djangorestframework-simplejwt[crypto]==5.2.*
pyhumps==3.8.*
django-environ==0.9.*
dj-database-url==1.0.*
mediacloud-metadata==1.*
mediacloud-metadata==1.1.*
python-dateutil==2.8.*
django-redis==5.2.*
gunicorn==20.1.*
@@ -19,7 +19,7 @@ mc-manage @ git+https://github.com/mediacloud/[email protected]
pycountry==22.3.*
django-background-tasks-updated==1.2.* # need >= 1.2.6
django-cors-headers==4.3.*
feed-seeker==1.0.*
feed-seeker==1.1.*
numpy
pandas
sitemap-tools @ git+https://github.com/mediacloud/sitemap-tools@v1.0.0
#pandas
sitemap-tools @ git+https://github.com/mediacloud/sitemap-tools@v2.0.0