diff --git a/mindfactory/pipelines/database_pipeline.py b/mindfactory/pipelines/database_pipeline.py index fe847c0..7b16b57 100644 --- a/mindfactory/pipelines/database_pipeline.py +++ b/mindfactory/pipelines/database_pipeline.py @@ -2,35 +2,52 @@ class DatabasePipeline(object): - def __init__(self): self.connection = sqlite3.connect('./scrapedata.db') + self.connection.isolation_level = None # Autocommit mode; transactions are opened manually via "begin" self.cursor = self.connection.cursor() - self.cursor.execute("""CREATE TABLE IF NOT EXISTS productdata (id INTEGER PRIMARY KEY, url TEXT, category TEXT, - name TEXT, brand TEXT, ean INTEGER, sku TEXT, count_sold INTEGER, people_watching INTEGER, - rma_quote INTEGER, price REAL)""") - self.cursor.execute("""CREATE TABLE IF NOT EXISTS reviewdata (product_id INTEGER, stars INTEGER, - review_text TEXT, author TEXT, date TEXT, verified TEXT)""") + self.cursor.execute("""CREATE TABLE IF NOT EXISTS productdata + ( + id INTEGER PRIMARY KEY, + url TEXT NOT NULL, + category TEXT, + name TEXT, + brand TEXT, + ean CHAR(13), + sku TEXT, + count_sold INTEGER, + people_watching INTEGER, + rma_quote INTEGER, + price REAL + )""") + self.cursor.execute("""CREATE TABLE IF NOT EXISTS reviewdata + ( + product_id INTEGER REFERENCES productdata (id) NOT NULL, + stars INTEGER, + review_text TEXT, + author TEXT, + date TEXT, + verified BIT + )""") def close_spider(self, spider): self.cursor.close() self.connection.close() def process_item(self, item, spider): - self.cursor.execute("""INSERT INTO productdata (url, category, name, brand, ean, sku, count_sold, - people_watching, rma_quote, price) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", - (item["url"], item["category"], - item["name"], item["brand"], item["ean"], item["sku"], item["count_sold"], - item["people_watching"], - item["rma_quote"], item["price"])) + self.cursor.execute("begin") + self.cursor.execute("""INSERT INTO productdata + (url, category, name, brand, ean, sku, count_sold, people_watching, rma_quote, price) + VALUES 
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + (item["url"], item["category"], item["name"], item["brand"], item["ean"], item["sku"], + item["count_sold"], item["people_watching"], item["rma_quote"], item["price"])) row_id = self.cursor.lastrowid - for review in item["reviews"]: + for rev in item["reviews"]: self.cursor.execute("""INSERT INTO reviewdata VALUES (?, ?, ?, ?, ?, ?)""", - (row_id, review["stars"], review["text"], review["author"], review["date"], - review["verified"])) + (row_id, rev["stars"], rev["text"], rev["author"], rev["date"], rev["verified"])) self.connection.commit() return item def handle_error(self, e): - # Next level error handling + # Next level error handling. pass diff --git a/mindfactory/settings.py b/mindfactory/settings.py index cf00303..e1dd6bf 100644 --- a/mindfactory/settings.py +++ b/mindfactory/settings.py @@ -14,6 +14,8 @@ SPIDER_MODULES = ['mindfactory.spiders'] NEWSPIDER_MODULE = 'mindfactory.spiders' +# Log only information output +LOG_LEVEL = "INFO" # Crawl responsibly by identifying yourself (and your website) on the user-agent USER_AGENT = 'mindfactory_bot' diff --git a/mindfactory/spiders/product_spider.py b/mindfactory/spiders/product_spider.py index dbc3d47..46214d4 100644 --- a/mindfactory/spiders/product_spider.py +++ b/mindfactory/spiders/product_spider.py @@ -64,8 +64,8 @@ def parse_product(self, response): item["category"] = response.xpath(self.product_category).extract_first() item["name"] = response.xpath(self.product_name_xpath).extract_first() item["brand"] = response.xpath(self.product_brand_xpath).extract_first() - item["ean"] = response.xpath(self.product_ean_xpath).extract_first(default=-1) - item["sku"] = response.xpath(self.product_sku_xpath).extract_first(default=-1) + item["ean"] = response.xpath(self.product_ean_xpath).extract_first(default=None) + item["sku"] = response.xpath(self.product_sku_xpath).extract_first(default=None) # There are prices and special prices for some reason. 
sprice = response.xpath(self.product_sprice_xpath).extract_first(default=None) price = response.xpath(self.product_price_xpath).extract_first(default=None) @@ -78,19 +78,19 @@ def parse_product(self, response): if text: item["price"] = float(text.replace("-", "0").replace(".", "").replace(",", ".")) if "price" not in item: - item["price"] = -1 + item["price"] = None count_and_people = response.xpath(self.product_count_xpath).extract() sold_or_people = response.xpath(self.product_sold_or_people).extract_first(default=None) if len(count_and_people) == 2: item["count_sold"] = int(count_and_people[0].replace(".", "")) item["people_watching"] = int(count_and_people[1].replace(".", "")) elif len(count_and_people) == 1: - item["count_sold"] = int(count_and_people[0].replace(".", "")) if sold_or_people is not None else -1 - item["people_watching"] = -1 if sold_or_people is not None else int(count_and_people[0].replace(".", "")) + item["count_sold"] = int(count_and_people[0].replace(".", "")) if sold_or_people is not None else None + item["people_watching"] = None if sold_or_people is not None else int(count_and_people[0].replace(".", "")) else: - item["count_sold"] = item["people_watching"] = -1 + item["count_sold"] = item["people_watching"] = None rma = response.xpath(self.product_rma_xpath).extract_first(default=None) - item["rma_quote"] = int(rma.strip()[:-1]) if rma is not None else -1 + item["rma_quote"] = int(rma.strip()[:-1]) if rma is not None else None item["reviews"] = [] for review in response.xpath(self.review_xpath): item["reviews"].append(self.parse_review(review)) @@ -116,9 +116,10 @@ def parse_review(self, review): rev = ReviewItem() rev["stars"] = len(review.xpath(self.review_stars_xpath)) rev["author"] = review.xpath(self.review_author_xpath).extract_first() - rev["date"] = review.xpath(self.review_date_xpath).extract_first()[3:] - rev["verified"] = "True" if review.xpath(self.review_verified_xpath).extract_first( - default=None) is not None else "False" 
+ date_list = review.xpath(self.review_date_xpath).extract_first()[3:].split(".") + rev["date"] = f"{date_list[2]}-{date_list[1]}-{date_list[0]}" + rev["verified"] = 1 if review.xpath(self.review_verified_xpath).extract_first( + default=None) is not None else 0 text = review.xpath(self.review_text_xpath).extract() rev["text"] = " ".join([x.strip() for x in text]) return rev