[+] Passmark crawling

This commit is contained in:
2023-12-26 22:23:27 -05:00
parent 304e590f59
commit 8ddebfe15d
2 changed files with 4591 additions and 0 deletions
+4539
View File
File diff suppressed because it is too large Load Diff
+52
View File
@@ -0,0 +1,52 @@
"""
Crawl PassMark CPU benchmark data and store it as a CSV file.
"""
from pathlib import Path
from typing import NamedTuple
import pandas as pd
import requests
from bs4 import BeautifulSoup
# One scraped CPU entry. Fields, in positional order:
#   id       -- integer taken from the table row's "cpu<N>" id attribute
#   name     -- CPU name text from the first table cell
#   passmark -- integer score parsed from the second table cell
CPU = NamedTuple(
    "CPU",
    [
        ("id", int),
        ("name", str),
        ("passmark", int),
    ],
)
def crawl_cpu(
    cache_file: Path = Path("data/cpu.csv"),
    timeout: float = 30.0,
) -> pd.DataFrame:
    """
    Crawl cpu benchmark data, using a CSV file as a cache.

    If ``cache_file`` already exists it is loaded and returned without any
    network access. Otherwise the CPU list page is downloaded, the
    ``cputable`` table is parsed into ``CPU`` records, the result is written
    to ``cache_file`` (parent directories are created as needed) and
    returned.

    :param cache_file: CSV path used as the cache (default ``data/cpu.csv``).
    :param timeout: Seconds to wait for the HTTP request before giving up.
    :return: DataFrame with one row per CPU; freshly crawled data has the
        columns ``id``, ``name`` and ``passmark``.
    :raises requests.RequestException: if the download fails, times out or
        returns an HTTP error status.
    :raises ValueError: if the page has no ``cputable`` table or no row
        could be parsed.
    """
    file = Path(cache_file)
    if file.exists():
        return pd.read_csv(file)

    url = "https://www.cpubenchmark.net/cpu_list.php"
    # Without a timeout requests can block forever on a stalled connection.
    page = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    page.raise_for_status()

    bs = BeautifulSoup(page.content, "html.parser")
    table = bs.find("table", {"id": "cputable"})
    if table is None:
        raise ValueError(f"No table with id 'cputable' found at {url}")

    cpu_list = []
    for row in table.find_all("tr"):
        cols = row.find_all("td")
        # Header rows have no <td>; data rows need a name and a score cell.
        if len(cols) < 2:
            continue
        try:
            cpu_id = int(row.get("id", "").replace("cpu", ""))
            passmark = int(cols[1].text.strip().replace(",", ""))
        except ValueError:
            # Row without a numeric id or score (e.g. "NA"): skip it rather
            # than abort the whole crawl.
            continue
        cpu_list.append(
            CPU(id=cpu_id, name=cols[0].text.strip(), passmark=passmark)
        )

    if not cpu_list:
        # Do not write an empty cache file: later calls would load it
        # instead of retrying the crawl.
        raise ValueError(f"No CPU rows could be parsed from {url}")

    file.parent.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame(cpu_list)
    df.to_csv(file, index=False)
    return df
# Script entry point: running this file directly performs the crawl
# (or loads the cached CSV); importing the module does nothing.
if __name__ == '__main__':
    crawl_cpu()