Skip to content

Commit

Permalink
Changes to the database handling.
Browse files Browse the repository at this point in the history
Signed-off-by: RobMcH <[email protected]>
  • Loading branch information
RobMcH committed Jan 28, 2019
1 parent 06e0946 commit 4e2f536
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 26 deletions.
49 changes: 33 additions & 16 deletions mindfactory/pipelines/database_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,52 @@


class DatabasePipeline(object):
    """Scrapy item pipeline that stores products and their reviews in SQLite.

    Each item is written inside one explicit transaction so that a product
    row and all of its review rows are either stored together or not at all.
    """

    def __init__(self, db_path='./scrapedata.db'):
        """Open the database and create the tables if they do not exist.

        Args:
            db_path: Path of the SQLite database file. Defaults to the
                location the pipeline has always used.
        """
        self.connection = sqlite3.connect(db_path)
        # isolation_level = None puts the sqlite3 module into autocommit
        # mode, i.e. it stops opening transactions implicitly. Transactions
        # are controlled explicitly with BEGIN/COMMIT in process_item().
        self.connection.isolation_level = None
        self.cursor = self.connection.cursor()
        # SQLite ignores REFERENCES constraints unless foreign key support is
        # switched on for the connection (must happen outside a transaction).
        self.cursor.execute("PRAGMA foreign_keys = ON")
        self.cursor.execute("""CREATE TABLE IF NOT EXISTS productdata
                            (
                                id INTEGER PRIMARY KEY,
                                url TEXT NOT NULL,
                                category TEXT,
                                name TEXT,
                                brand TEXT,
                                ean CHAR(13),
                                sku TEXT,
                                count_sold INTEGER,
                                people_watching INTEGER,
                                rma_quote INTEGER,
                                price REAL
                            )""")
        self.cursor.execute("""CREATE TABLE IF NOT EXISTS reviewdata
                            (
                                product_id INTEGER REFERENCES productdata (id) NOT NULL,
                                stars INTEGER,
                                review_text TEXT,
                                author TEXT,
                                date TEXT,
                                verified BIT
                            )""")

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor failed.
            self.connection.close()

    def process_item(self, item, spider):
        """Insert one product and its reviews atomically.

        Args:
            item: Mapping with the product columns plus a "reviews" list of
                mappings with "stars", "text", "author", "date", "verified".
            spider: The spider that produced the item (unused).

        Returns:
            The unmodified item, so later pipelines can process it.

        Raises:
            Exception: Whatever error occurred while storing the item. The
                transaction is rolled back before the error is re-raised.
        """
        self.cursor.execute("BEGIN")
        try:
            self.cursor.execute(
                """INSERT INTO productdata
                   (url, category, name, brand, ean, sku, count_sold, people_watching, rma_quote, price)
                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (item["url"], item["category"], item["name"], item["brand"], item["ean"], item["sku"],
                 item["count_sold"], item["people_watching"], item["rma_quote"], item["price"]))
            # Capture the product id before any further statement runs.
            row_id = self.cursor.lastrowid
            review_rows = [
                (row_id, rev["stars"], rev["text"], rev["author"], rev["date"], rev["verified"])
                for rev in item["reviews"]
            ]
            self.cursor.executemany("""INSERT INTO reviewdata VALUES (?, ?, ?, ?, ?, ?)""", review_rows)
            self.connection.commit()
        except Exception as e:
            # Without the rollback the transaction would stay open and the
            # BEGIN of the next item would fail.
            self.connection.rollback()
            self.handle_error(e)
            raise
        return item

    def handle_error(self, e):
        """Log an error that occurred while storing an item."""
        # Imported locally so the module's import block stays untouched.
        import logging
        logging.getLogger(__name__).error("Failed to store scraped item: %r", e)
2 changes: 2 additions & 0 deletions mindfactory/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
SPIDER_MODULES = ['mindfactory.spiders']
NEWSPIDER_MODULE = 'mindfactory.spiders'

# Only log messages at INFO level and above (suppresses DEBUG output)
LOG_LEVEL = "INFO"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'mindfactory_bot'
Expand Down
21 changes: 11 additions & 10 deletions mindfactory/spiders/product_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ def parse_product(self, response):
item["category"] = response.xpath(self.product_category).extract_first()
item["name"] = response.xpath(self.product_name_xpath).extract_first()
item["brand"] = response.xpath(self.product_brand_xpath).extract_first()
item["ean"] = response.xpath(self.product_ean_xpath).extract_first(default=-1)
item["sku"] = response.xpath(self.product_sku_xpath).extract_first(default=-1)
item["ean"] = response.xpath(self.product_ean_xpath).extract_first(default=None)
item["sku"] = response.xpath(self.product_sku_xpath).extract_first(default=None)
# There are prices and special prices for some reason.
sprice = response.xpath(self.product_sprice_xpath).extract_first(default=None)
price = response.xpath(self.product_price_xpath).extract_first(default=None)
Expand All @@ -78,19 +78,19 @@ def parse_product(self, response):
if text:
item["price"] = float(text.replace("-", "0").replace(".", "").replace(",", "."))
if "price" not in item:
item["price"] = -1
item["price"] = None
count_and_people = response.xpath(self.product_count_xpath).extract()
sold_or_people = response.xpath(self.product_sold_or_people).extract_first(default=None)
if len(count_and_people) == 2:
item["count_sold"] = int(count_and_people[0].replace(".", ""))
item["people_watching"] = int(count_and_people[1].replace(".", ""))
elif len(count_and_people) == 1:
item["count_sold"] = int(count_and_people[0].replace(".", "")) if sold_or_people is not None else -1
item["people_watching"] = -1 if sold_or_people is not None else int(count_and_people[0].replace(".", ""))
item["count_sold"] = int(count_and_people[0].replace(".", "")) if sold_or_people is not None else None
item["people_watching"] = None if sold_or_people is not None else int(count_and_people[0].replace(".", ""))
else:
item["count_sold"] = item["people_watching"] = -1
item["count_sold"] = item["people_watching"] = None
rma = response.xpath(self.product_rma_xpath).extract_first(default=None)
item["rma_quote"] = int(rma.strip()[:-1]) if rma is not None else -1
item["rma_quote"] = int(rma.strip()[:-1]) if rma is not None else None
item["reviews"] = []
for review in response.xpath(self.review_xpath):
item["reviews"].append(self.parse_review(review))
Expand All @@ -116,9 +116,10 @@ def parse_review(self, review):
rev = ReviewItem()
rev["stars"] = len(review.xpath(self.review_stars_xpath))
rev["author"] = review.xpath(self.review_author_xpath).extract_first()
rev["date"] = review.xpath(self.review_date_xpath).extract_first()[3:]
rev["verified"] = "True" if review.xpath(self.review_verified_xpath).extract_first(
default=None) is not None else "False"
date_list = review.xpath(self.review_date_xpath).extract_first()[3:].split(".")
rev["date"] = f"{date_list[2]}-{date_list[1]}-{date_list[0]}"
rev["verified"] = 1 if review.xpath(self.review_verified_xpath).extract_first(
default=None) is not None else 0
text = review.xpath(self.review_text_xpath).extract()
rev["text"] = " ".join([x.strip() for x in text])
return rev

0 comments on commit 4e2f536

Please sign in to comment.