diff --git a/mindfactory/pipelines/database_pipeline.py b/mindfactory/pipelines/database_pipeline.py index 7b16b57..827467d 100644 --- a/mindfactory/pipelines/database_pipeline.py +++ b/mindfactory/pipelines/database_pipeline.py @@ -4,8 +4,12 @@ class DatabasePipeline(object): def __init__(self): self.connection = sqlite3.connect('./scrapedata.db') - self.connection.isolation_level = None # Disable autocommit self.cursor = self.connection.cursor() + # If this returns None, the table does not exist. In this case INSERT instead of UPDATE is used. + self.mode = self.cursor.execute(""" + SELECT name FROM sqlite_master + WHERE type='table' AND name='productdata' + """).fetchone() self.cursor.execute("""CREATE TABLE IF NOT EXISTS productdata ( id INTEGER PRIMARY KEY, @@ -29,22 +33,46 @@ def __init__(self): date TEXT, verified BIT )""") + self.cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS url_index ON productdata(url)") + self.cursor.execute("CREATE UNIQUE INDEX IF NOT EXISTS id_index ON productdata(id)") + self.cursor.execute("CREATE INDEX IF NOT EXISTS pid_index ON reviewdata(product_id)") + self.connection.commit() def close_spider(self, spider): self.cursor.close() self.connection.close() def process_item(self, item, spider): - self.cursor.execute("begin") - self.cursor.execute("""INSERT INTO productdata - (url, category, name, brand, ean, sku, count_sold, people_watching, rma_quote, price) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", - (item["url"], item["category"], item["name"], item["brand"], item["ean"], item["sku"], - item["count_sold"], item["people_watching"], item["rma_quote"], item["price"])) + # Either insert items, iff the scraper is run for the first time, or update their entries. + present = self.cursor.execute(f"SELECT id FROM productdata WHERE url = (?)", (item["url"],)).fetchone() + if not (self.mode and present): + self.cursor.execute(""" + INSERT INTO productdata + (url, category, name, brand, ean, sku, count_sold, people_watching, rma_quote, price) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + (item["url"], item["category"], item["name"], item["brand"], item["ean"], item["sku"], + item["count_sold"], item["people_watching"], item["rma_quote"], item["price"])) + else: + self.cursor.execute(f""" + UPDATE productdata + SET count_sold = (?), + people_watching = (?), + rma_quote = (?), + price = (?) + WHERE url = (?) + """, (item['count_sold'], item['people_watching'], item['rma_quote'], item['price'], + item["url"])) row_id = self.cursor.lastrowid for rev in item["reviews"]: - self.cursor.execute("""INSERT INTO reviewdata VALUES (?, ?, ?, ?, ?, ?)""", - (row_id, rev["stars"], rev["text"], rev["author"], rev["date"], rev["verified"])) + # Only insert reviews that are not yet present in the database. + if not self.cursor.execute(f""" + SELECT verified from reviewdata + WHERE author = (?) + AND date = (?) + """, (rev["author"], rev["date"])).fetchone(): + self.cursor.execute("""INSERT INTO reviewdata VALUES (?, ?, ?, ?, ?, ?)""", + (row_id, rev["stars"], rev["text"], rev["author"], rev["date"], rev["verified"])) self.connection.commit() return item diff --git a/mindfactory/spiders/product_spider.py b/mindfactory/spiders/product_spider.py index 46214d4..25045e3 100644 --- a/mindfactory/spiders/product_spider.py +++ b/mindfactory/spiders/product_spider.py @@ -21,8 +21,8 @@ def __init__(self, *args, **kwargs): self.product_brand_xpath = '//span[@itemprop="brand"]/text()' self.product_ean_xpath = '//span[@class="product-ean"]/text()' self.product_sku_xpath = '//span[@class="sku-model"]/text()' - self.product_sprice_xpath = '//span[@class="specialPriceText"]/text()' - self.product_price_xpath = '//div[@id="priceCol"]/div[@class="pprice"]/text()[3]' + self.product_price_xpath = '//span[@class="specialPriceText"]/text() |' \ + ' //div[@id="priceCol"]/div[@class="pprice"]/text()[3]' # First element is the amount of sold products; second element is the amount of people watching this product self.product_sold_or_people = '//*[@id="cart_quantity"]//div[@class="psold"]/text()[2]' self.product_count_xpath = '//*[@id="cart_quantity"]//div[@class="psold"]/span[@class="pcountsold"]/text()' @@ -34,11 +34,6 @@ def __init__(self, *args, **kwargs): self.review_date_xpath = 'div[1]/div/div[2]/span/text()' self.review_verified_xpath = 'div[1]/div/div[3]/strong/span' self.review_text_xpath = 'div[2]/div/text()' - # There are two different site structures containing the number of reviews for some reason. - self.review_number_xpath_old = '//span[@class="reviewcount"]/text()' - self.review_number_xpath_new = '//span[@itemprop="reviewCount"]/text()' - self.num_page_xpath = '//*[@id="moreReviews"]/div[4]/div[5]/div/div[3]/nav/ul/li[1]/a/text()' - self.reviews = [] super(ProductSpider, self).__init__(*args, **kwargs) def parse(self, response): @@ -67,17 +62,12 @@ def parse_product(self, response): item["ean"] = response.xpath(self.product_ean_xpath).extract_first(default=None) item["sku"] = response.xpath(self.product_sku_xpath).extract_first(default=None) # There are prices and special prices for some reason. - sprice = response.xpath(self.product_sprice_xpath).extract_first(default=None) price = response.xpath(self.product_price_xpath).extract_first(default=None) if price is not None: text = price.rstrip()[1:-1] if text: item["price"] = float(text.replace("-", "0").replace(".", "").replace(",", ".")) - if sprice is not None: - text = sprice.rstrip()[1:-1] - if text: - item["price"] = float(text.replace("-", "0").replace(".", "").replace(",", ".")) - if "price" not in item: + else: item["price"] = None count_and_people = response.xpath(self.product_count_xpath).extract() sold_or_people = response.xpath(self.product_sold_or_people).extract_first(default=None) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..14be00e --- /dev/null +++ b/setup.py @@ -0,0 +1,10 @@ +# Automatically created by: scrapyd-deploy + +from setuptools import setup, find_packages + +setup( + name='mindfactory', + version='1.0', + packages=find_packages(), + entry_points={'scrapy': ['settings = mindfactory.settings']}, install_requires=['scrapy'] +)