"""
|
|
This file is used to crawl Canada Computers items
|
|
"""
import json
import re
from pathlib import Path
from typing import NamedTuple

import js2py
import pandas as pd
import requests
from bs4 import BeautifulSoup, Tag, ResultSet
from hypy_utils import nlp_utils
from hypy_utils.logging_utils import setup_logger
from hypy_utils.nlp_utils import substr_between
from hypy_utils.tqdm_utils import tmap
from js2py.internals.simplex import JsException

log = setup_logger()


def get_match(pattern: str | re.Pattern, html: str) -> str | None:
    """
    Get the first match of a pattern in html

    :param pattern: regex pattern with at least one capture group
    :param html: html to search in
    :return: the first capture group of the match, or None if nothing matched
    """
    match = re.search(pattern, html)
    if match is None:
        return None
    return match.group(1)
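
# e.g. get_match(r'data-item-id="(\d+)"', '<div data-item-id="123">') returns '123'
# (illustrative input; this is the same pattern as RE_ID below)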


RE_ID = re.compile(r'data-item-id="(\d+)"')


class Product(NamedTuple):
    id: int
    item_id: str
    name: str
    brand: str
    categories: list[str]
    price: float
    specs: dict[str, str]
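
# An illustrative record (field values are made up, not real site data):
# Product(id=123456, item_id='ABC1234', name='Example 14" Laptop', brand='ExampleBrand',
#         categories=['Laptops', 'Ultrabooks', '', ''], price=799.99,
#         specs={'CPU': 'Intel Core i5'})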


def crawl_page(url: str, p: int) -> list[Product]:
    """
    Crawl a page of Canada Computers items

    :param url: URL of an item listing page
    :param p: Page number
    :return: list of Product
    """
    items = []
    log.info(f"Crawling page {p}")
    page = requests.get(url, params={"page": p, "ajax": "true"})
    # We can't run BS4 over the full page because the html contains json that is not properly escaped

    # Find each productTemplate block
    products: list[str] = page.text.split('<div class="col-12 py-1 px-1 bg-white mb-1 productTemplate')[1:]
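
    # Each fragment starts just past the opening productTemplate div, so it is not well-formed
    # HTML on its own; fields are pulled out with substring matching rather than a full parse.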

    for product in products:
        try:
            # Get ID (regex, data-item-id="(\d+)")
            prod_id = substr_between(product, 'data-item-id="', '"')

            # Get the gtag 'select_item' payload, eval it to get the product metadata
            js = substr_between(product, r"""onclick="gtag('event', 'select_item', """, ')" >')
            js = f"a = {js}; a"
            item = js2py.eval_js(js)['items'][0]
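
            # The payload shape is assumed (from the keys used below) to be roughly:
            # {'items': [{'item_id': ..., 'item_name': ..., 'item_brand': ...,
            #             'item_category': ..., 'item_category_2': ..., 'price': ...}]}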

            # Get specs from the tooltip html stored in the title attribute
            specs_html = substr_between(product, """<div class="d-inline pl-0_5" data-toggle="tooltip" data-html="true" title='""", "'>")
            specs_html = BeautifulSoup(specs_html, "html.parser")
            specs = [li.text.strip().split(":", 1) for li in specs_html.find_all("li")]
            specs = {k.strip(): v.strip() for k, v in specs}
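
            # e.g. an entry like "<li>CPU: Intel Core i5</li>" (illustrative) becomes
            # {"CPU": "Intel Core i5"}; an <li> without a ":" raises ValueError, caught below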

            items.append(Product(
                id=int(prod_id),
                item_id=item['item_id'],
                name=item['item_name'],
                brand=item['item_brand'],
                categories=[item['item_category'], item['item_category_2'], item['item_category_3'], item['item_category_4']],
                price=float(item['price']),
                specs=specs
            ))
            log.debug(f"> Got: {items[-1].name}")

        except ValueError:
            log.warning("Failed to parse product")
            continue

        except JsException as e:
            log.warning(f"JsException: {e}")
            continue

        except Exception as e:
            log.warning(f"Exception: {e}")
            continue

    return items


def crawl_url(url: str) -> pd.DataFrame:
    """
    Crawl Canada Computers items, caching the results to a csv file

    :param url: url to crawl
    :return: DataFrame of crawled products
    """
    file = Path("data/canada_computers_laptops.csv")

    if file.exists():
        return pd.read_csv(file)

    items = []
    i = 0
    batch_size = 20
    while True:
        r = list(range(i * batch_size + 1, (i + 1) * batch_size + 1))
        log.info(f"Crawling batch {i}")

        # Crawl this batch of pages in parallel
        batch = tmap(lambda p: crawl_page(url, p), r, max_workers=10)
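
        # tmap (from hypy_utils.tqdm_utils) maps crawl_page over the page numbers with a
        # worker pool (max_workers=10), presumably with a tqdm progress bar, so the pages
        # within a batch are fetched concurrently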

        # Flatten the per-page lists into one list of products
        items += [item for sublist in batch for item in sublist]

        # If any page in this batch came back empty, we have run past the last page
        if any(len(b) == 0 for b in batch):
            log.info(f"Done at batch {i}")
            break

        i += 1

    file.parent.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame(items)
    df.to_csv(file, index=False)

    return df


if __name__ == '__main__':
    # cPath 710 is laptops
    crawl_url("https://www.canadacomputers.com/index.php?cPath=710")