Skip to content

Commit

Permalink
Changes to the database handling.
Browse files Browse the repository at this point in the history
Signed-off-by: RobMcH <[email protected]>
  • Loading branch information
RobMcH committed Feb 11, 2019
1 parent 4e2f536 commit fb623e4
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 22 deletions.
46 changes: 37 additions & 9 deletions mindfactory/pipelines/database_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
class DatabasePipeline(object):
def __init__(self):
self.connection = sqlite3.connect('./scrapedata.db')
self.connection.isolation_level = None # Disable autocommit
self.cursor = self.connection.cursor()
# If this returns None, the table does not exist. In this case INSERT instead of UPDATE is used.
self.mode = self.cursor.execute("""
SELECT name FROM sqlite_master
WHERE type='table' AND name='productdata'
""").fetchone()
self.cursor.execute("""CREATE TABLE IF NOT EXISTS productdata
(
id INTEGER PRIMARY KEY,
Expand All @@ -29,22 +33,46 @@ def __init__(self):
date TEXT,
verified BIT
)""")
self.cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS url_index ON productdata(url)")
self.cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS id_index ON productdata(id)")
self.cursor.execute("CREATE INDEX IF NOT EXISTS pid_index ON reviewdata(product_id)")
self.connection.commit()

def close_spider(self, spider):
    """Release the sqlite handles once the spider has finished.

    Closes the cursor first, then the connection, mirroring the order
    in which they were acquired in ``__init__``.
    """
    for handle in (self.cursor, self.connection):
        handle.close()

def process_item(self, item, spider):
    """Persist a scraped product and its reviews in sqlite.

    On a fresh database (``self.mode`` is None) or for an unseen URL the
    product is inserted; otherwise only the volatile columns
    (count_sold, people_watching, rma_quote, price) are updated.
    Reviews are inserted only when no review with the same
    (author, date) pair is already stored.

    :param item: scraped product dict with the productdata columns plus
        a ``"reviews"`` list of review dicts.
    :param spider: the spider that produced the item (unused).
    :return: the unmodified item, so later pipeline stages still run.
    """
    # Explicit transaction: one commit per item keeps product and review
    # rows consistent even if the process dies mid-item.
    self.cursor.execute("begin")
    # Either insert items, iff the scraper is run for the first time, or update their entries.
    present = self.cursor.execute(
        "SELECT id FROM productdata WHERE url = (?)", (item["url"],)).fetchone()
    if not (self.mode and present):
        self.cursor.execute("""
            INSERT INTO productdata
            (url, category, name, brand, ean, sku, count_sold, people_watching, rma_quote, price)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
                            (item["url"], item["category"], item["name"], item["brand"],
                             item["ean"], item["sku"], item["count_sold"],
                             item["people_watching"], item["rma_quote"], item["price"]))
    else:
        self.cursor.execute("""
            UPDATE productdata
            SET count_sold = (?),
                people_watching = (?),
                rma_quote = (?),
                price = (?)
            WHERE url = (?)
            """, (item["count_sold"], item["people_watching"], item["rma_quote"],
                  item["price"], item["url"]))
    # BUGFIX: lastrowid is only meaningful after an INSERT; in the UPDATE
    # branch it would attach the reviews to an unrelated product row.
    # Reuse the id looked up above when the product already exists.
    row_id = present[0] if present else self.cursor.lastrowid
    for rev in item["reviews"]:
        # Only insert reviews that are not yet present in the database.
        if not self.cursor.execute("""
            SELECT verified from reviewdata
            WHERE author = (?)
            AND date = (?)
            """, (rev["author"], rev["date"])).fetchone():
            self.cursor.execute("INSERT INTO reviewdata VALUES (?, ?, ?, ?, ?, ?)",
                                (row_id, rev["stars"], rev["text"], rev["author"],
                                 rev["date"], rev["verified"]))
    self.connection.commit()
    return item

Expand Down
16 changes: 3 additions & 13 deletions mindfactory/spiders/product_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ def __init__(self, *args, **kwargs):
self.product_brand_xpath = '//span[@itemprop="brand"]/text()'
self.product_ean_xpath = '//span[@class="product-ean"]/text()'
self.product_sku_xpath = '//span[@class="sku-model"]/text()'
self.product_sprice_xpath = '//span[@class="specialPriceText"]/text()'
self.product_price_xpath = '//div[@id="priceCol"]/div[@class="pprice"]/text()[3]'
self.product_price_xpath = '//span[@class="specialPriceText"]/text() |' \
' //div[@id="priceCol"]/div[@class="pprice"]/text()[3]'
# First element is the amount of sold products; second element is the amount of people watching this product
self.product_sold_or_people = '//*[@id="cart_quantity"]//div[@class="psold"]/text()[2]'
self.product_count_xpath = '//*[@id="cart_quantity"]//div[@class="psold"]/span[@class="pcountsold"]/text()'
Expand All @@ -34,11 +34,6 @@ def __init__(self, *args, **kwargs):
self.review_date_xpath = 'div[1]/div/div[2]/span/text()'
self.review_verified_xpath = 'div[1]/div/div[3]/strong/span'
self.review_text_xpath = 'div[2]/div/text()'
# There are two different site structures containing the number of reviews for some reason.
self.review_number_xpath_old = '//span[@class="reviewcount"]/text()'
self.review_number_xpath_new = '//span[@itemprop="reviewCount"]/text()'
self.num_page_xpath = '//*[@id="moreReviews"]/div[4]/div[5]/div/div[3]/nav/ul/li[1]/a/text()'
self.reviews = []
super(ProductSpider, self).__init__(*args, **kwargs)

def parse(self, response):
Expand Down Expand Up @@ -67,17 +62,12 @@ def parse_product(self, response):
item["ean"] = response.xpath(self.product_ean_xpath).extract_first(default=None)
item["sku"] = response.xpath(self.product_sku_xpath).extract_first(default=None)
# There are prices and special prices for some reason.
sprice = response.xpath(self.product_sprice_xpath).extract_first(default=None)
price = response.xpath(self.product_price_xpath).extract_first(default=None)
if price is not None:
text = price.rstrip()[1:-1]
if text:
item["price"] = float(text.replace("-", "0").replace(".", "").replace(",", "."))
if sprice is not None:
text = sprice.rstrip()[1:-1]
if text:
item["price"] = float(text.replace("-", "0").replace(".", "").replace(",", "."))
if "price" not in item:
else:
item["price"] = None
count_and_people = response.xpath(self.product_count_xpath).extract()
sold_or_people = response.xpath(self.product_sold_or_people).extract_first(default=None)
Expand Down
10 changes: 10 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Automatically created by: scrapyd-deploy

from setuptools import setup, find_packages

# Packaging metadata for scrapyd deployment; the entry point tells
# scrapyd where to find the project's Scrapy settings module.
setup(
    name="mindfactory",
    version="1.0",
    packages=find_packages(),
    install_requires=["scrapy"],
    entry_points={"scrapy": ["settings = mindfactory.settings"]},
)

0 comments on commit fb623e4

Please sign in to comment.