[+] Passmark gpu info
This commit is contained in:
File diff suppressed because it is too large
Load Diff
+65
-2
@@ -1,14 +1,20 @@
|
||||
"""
|
||||
This file is used to crawl passmark data and store it as a csv
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from hypy_utils.logging_utils import setup_logger
|
||||
from hypy_utils.tqdm_utils import tmap
|
||||
from orjson import orjson
|
||||
|
||||
|
||||
log = setup_logger()
|
||||
|
||||
class Processor(NamedTuple):
|
||||
id: str
|
||||
name: str
|
||||
@@ -50,6 +56,63 @@ def crawl_cpu_gpu(cpu: bool) -> pd.DataFrame:
|
||||
return df
|
||||
|
||||
|
||||
def crawl_id(id: str):
|
||||
"""
|
||||
Crawl cpu/gpu benchmark data
|
||||
|
||||
:param id: id of cpu/gpu
|
||||
"""
|
||||
cpu = id.startswith("cpu")
|
||||
id = id[3:]
|
||||
url = f"https://www.cpubenchmark.net/cpu.php" if cpu else f"https://www.videocardbenchmark.net/gpu.php"
|
||||
page = requests.get(url, params={"id": id})
|
||||
bs = BeautifulSoup(page.content, "html5lib")
|
||||
|
||||
desc = bs.find("div", {"class": "desc"})
|
||||
name = desc.find("span", {"class": "cpuname"}).text.strip()
|
||||
specs = {}
|
||||
spec_nodes = (list(desc.find("em", {"class": "left-desc-cpu"}).findAll("p")) +
|
||||
list(desc.find("div", {"class": "desc-foot"}).findAll("p")))
|
||||
for spec in spec_nodes:
|
||||
key = spec.find("strong")
|
||||
if not key:
|
||||
continue
|
||||
key = key.text.strip()
|
||||
value = spec.text.strip().replace(key, "").strip().replace("\u00a0", "")
|
||||
specs[key.strip(":").replace("\u00a0", "")] = value
|
||||
|
||||
# Parse score
|
||||
score_lines = desc.find("div", {"class": "right-desc"}).text.strip().splitlines()[1:]
|
||||
score = int(score_lines.pop(0).strip().strip("\t"))
|
||||
specs["Score"] = score
|
||||
for line in score_lines:
|
||||
if ':' not in line:
|
||||
continue
|
||||
key, value = line.split(":")
|
||||
key = key.strip()
|
||||
value = int(value.strip())
|
||||
specs[key] = value
|
||||
|
||||
return name, specs
|
||||
|
||||
|
||||
def crawl_gpuinfo_batch():
|
||||
gpu_info_f = Path("data/gpu_info.json")
|
||||
gpu_info = {} if not gpu_info_f.exists() else json.loads(gpu_info_f.read_text())
|
||||
left = [gpu for gpu in crawl_cpu_gpu(False)["id"] if gpu not in gpu_info]
|
||||
|
||||
# Batch
|
||||
bs = 100
|
||||
while len(left) > 0:
|
||||
log.info(f"Crawling batch of {bs}, {len(left)} left")
|
||||
batch = left[:bs]
|
||||
left = left[bs:]
|
||||
info = tmap(lambda id: crawl_id(id), batch, max_workers=20)
|
||||
gpu_info.update({id: info for id, info in zip(batch, info)})
|
||||
gpu_info_f.write_text(json.dumps(gpu_info, indent=4))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
crawl_cpu_gpu(True)
|
||||
crawl_cpu_gpu(False)
|
||||
# crawl_cpu_gpu(True)
|
||||
# crawl_cpu_gpu(False)
|
||||
crawl_gpuinfo_batch()
|
||||
|
||||
Reference in New Issue
Block a user