From 33e7ef016d618a438ef2ebb8d12e92a655f27ad7 Mon Sep 17 00:00:00 2001
From: liudongkai
Date: Thu, 5 Dec 2024 21:10:31 +0800
Subject: [PATCH] feat: xhs: add a random wait interval in non-proxy mode;
 store the xsec_token field in db storage mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 config/base_config.py      |  5 +++++
 media_platform/xhs/core.py | 22 +++++++++++++++++++---
 schema/tables.sql          |  4 +++-
 store/xhs/__init__.py      |  1 +
 4 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/config/base_config.py b/config/base_config.py
index 6d6d8b85..d5a2b0c4 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -21,10 +21,15 @@ CRAWLER_TYPE = (
     "search"  # Crawl type: search (keyword search) | detail (note detail) | creator (creator homepage data)
 )
 
+# Custom User-Agent (for now only effective for XHS)
+UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
 # Whether to enable the IP proxy
 ENABLE_IP_PROXY = False
 
+# Maximum crawl interval in seconds when no proxy is enabled (for now only effective for XHS)
+CRAWLER_MAX_SLEEP_SEC = 2
+
 # Size of the proxy IP pool
 IP_PROXY_POOL_COUNT = 2
 
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 0061aa94..532273bc 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -42,7 +42,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
     def __init__(self) -> None:
         self.index_url = "https://www.xiaohongshu.com"
         # self.user_agent = utils.get_user_agent()
-        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
+        self.user_agent = config.UA if config.UA else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
 
     async def start(self) -> None:
         playwright_proxy_format, httpx_proxy_format = None, None
@@ -195,10 +195,15 @@ async def get_creators_and_notes(self) -> None:
             if createor_info:
                 await xhs_store.save_creator(user_id, creator=createor_info)
 
+            # When the proxy is not enabled, use a longer random crawl interval
+            if config.ENABLE_IP_PROXY:
+                crawl_interval = random.random()
+            else:
+                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
             # Get all note information of the creator
             all_notes_list = await self.xhs_client.get_all_notes_by_creator(
                 user_id=user_id,
-                crawl_interval=random.random(),
+                crawl_interval=crawl_interval,
                 callback=self.fetch_creator_notes_detail,
             )
 
@@ -280,6 +285,11 @@ async def get_note_detail_async_task(
         """
         note_detail_from_html, note_detail_from_api = None, None
         async with semaphore:
+            # When the proxy is not enabled, use a longer random crawl interval
+            if config.ENABLE_IP_PROXY:
+                crawl_interval = random.random()
+            else:
+                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
             try:
                 # Try to fetch the web version of the note detail first, with cookies
                 note_detail_from_html: Optional[Dict] = (
@@ -287,6 +297,7 @@
                         note_id, xsec_source, xsec_token, enable_cookie=True
                     )
                 )
+                await asyncio.sleep(crawl_interval)
                 if not note_detail_from_html:
                     # If fetching the web version failed, retry without cookies
                     note_detail_from_html = (
@@ -354,10 +365,15 @@ async def get_comments(
         utils.logger.info(
             f"[XiaoHongShuCrawler.get_comments] Begin get note id comments {note_id}"
         )
+        # When the proxy is not enabled, use a longer random crawl interval
+        if config.ENABLE_IP_PROXY:
+            crawl_interval = random.random()
+        else:
+            crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
         await self.xhs_client.get_note_all_comments(
             note_id=note_id,
             xsec_token=xsec_token,
-            crawl_interval=random.random(),
+            crawl_interval=crawl_interval,
             callback=xhs_store.batch_update_xhs_note_comments,
             max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
         )
diff --git a/schema/tables.sql b/schema/tables.sql
index 46e14a22..c7d83122 100644
--- a/schema/tables.sql
+++ b/schema/tables.sql
@@ -534,4 +534,6 @@ CREATE TABLE `zhihu_creator` (
 
 -- add column `like_count` to douyin_aweme_comment
-alter table douyin_aweme_comment add column `like_count` varchar(255) NOT NULL DEFAULT '0' COMMENT 'like count';
\ No newline at end of file
+alter table douyin_aweme_comment add column `like_count` varchar(255) NOT NULL DEFAULT '0' COMMENT 'like count';
+
+alter table xhs_note add column xsec_token varchar(50) default null comment 'signature token';
\ No newline at end of file
diff --git a/store/xhs/__init__.py b/store/xhs/__init__.py
index c320a321..a31389ec 100644
--- a/store/xhs/__init__.py
+++ b/store/xhs/__init__.py
@@ -107,6 +107,7 @@ async def update_xhs_note(note_item: Dict):
         "last_modify_ts": utils.get_current_timestamp(),
         "note_url": f"https://www.xiaohongshu.com/explore/{note_id}?xsec_token={note_item.get('xsec_token')}&xsec_source=pc_search",
         "source_keyword": source_keyword_var.get(),
+        "xsec_token": note_item.get("xsec_token"),
     }
     utils.logger.info(f"[store.xhs.update_xhs_note] xhs note: {local_db_item}")
     await XhsStoreFactory.create_store().store_content(local_db_item)
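
A note on the repeated branch: the same five-line interval selection now appears
three times in core.py (creator notes, note details, comments). If it spreads
further, it could be pulled into a small helper along the lines sketched below.
This is a hypothetical refactor, not part of the patch; get_crawl_interval is an
invented name, and the sketch assumes it lives somewhere that can import the
project's config module the same way core.py does.

    import random

    import config


    def get_crawl_interval() -> float:
        """Pick a wait time between requests.

        With an IP proxy pool the pool absorbs the request rate, so a short
        0-1s jitter is enough; without one, wait a random 1 to
        CRAWLER_MAX_SLEEP_SEC seconds to stay under rate limits.
        """
        if config.ENABLE_IP_PROXY:
            return random.random()
        return random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)

One caveat either way: random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC) treats 1
as the lower bound, so CRAWLER_MAX_SLEEP_SEC is expected to be >= 1 (the default
of 2 gives waits of 1-2 seconds).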
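
For reference, here is what persisting xsec_token buys on the read side: a note
URL can be rebuilt from a stored row alone, using the same URL format that
update_xhs_note writes into note_url. A standalone sketch; the row values are
made up for illustration.

    # Hypothetical row as it might come back from the xhs_note table.
    row = {"note_id": "66fad51c000000001b0224b8", "xsec_token": "ABCDEFG123=="}
    note_url = (
        f"https://www.xiaohongshu.com/explore/{row['note_id']}"
        f"?xsec_token={row['xsec_token']}&xsec_source=pc_search"
    )
    print(note_url)

Without the dedicated column, the token would only exist embedded inside the
stored note_url string rather than as a queryable field.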