') js = f"a = {js}; a" p = js2py.eval_js(js)['items'][0] # Get specs specs_html = substr_between(product, """

") specs_html = BeautifulSoup(specs_html, "html.parser") specs = [li.text.strip().split(":", 1) for li in specs_html.findAll("li")] specs = {k.strip(): v.strip() for k, v in specs} items.append(Product( id=id, item_id=p['item_id'], name=p['item_name'], brand=p['item_brand'], categories=[p['item_category'], p['item_category_2'], p['item_category_3'], p['item_category_4']], price=float(p['price']), specs=specs )) log.debug(f"> Got: {items[-1].name}") except ValueError: log.warning("Failed to parse product") continue except JsException as e: log.warning(f"JsException: {e}") continue except Exception as e: log.warning(f"Exception: {e}") continue return items def crawl_url(url: str): """ Crawl Canada Computers items :param url: url to crawl """ file = Path("data/canada_computers_laptops.csv") if file.exists(): return pd.read_csv(file) items = [] i = 0 batch_size = 20 while True: r = list(range(i * batch_size + 1, (i + 1) * batch_size + 1)) log.info(f"Crawling batch {i}") # Get batch batch = tmap(lambda p: crawl_page(url, p), r, max_workers=10) # Flatten items += [item for sublist in batch for item in sublist] # If at least one batch is empty, we are done if any([len(b) == 0 for b in batch]): log.info(f"Done at batch {i}") break i += 1 file.parent.mkdir(parents=True, exist_ok=True) df = pd.DataFrame(items) df.to_csv(file, index=False) return items if __name__ == '__main__': # cPath 710 is laptops crawl_url("https://www.canadacomputers.com/index.php?cPath=710")