From 08f5ad90216144743c1a78a3f4135986ae036c80 Mon Sep 17 00:00:00 2001 From: Azalea Gui Date: Mon, 3 Apr 2023 10:15:17 -0400 Subject: [PATCH] [O] Configurable index path --- analysis.ipynb | 57 +++--------------------------------------------- config.toml | 2 ++ index_crawler.py | 4 ++-- requirements.txt | 8 +++++++ 4 files changed, 15 insertions(+), 56 deletions(-) create mode 100644 config.toml diff --git a/analysis.ipynb b/analysis.ipynb index 5eb5540..816acd1 100644 --- a/analysis.ipynb +++ b/analysis.ipynb @@ -1,57 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting seaborn\r\n", - " Downloading seaborn-0.12.2-py3-none-any.whl (293 kB)\r\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m293.3/293.3 kB\u001B[0m \u001B[31m7.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m\r\n", - "\u001B[?25hCollecting matplotlib\r\n", - " Downloading matplotlib-3.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)\r\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m11.6/11.6 MB\u001B[0m \u001B[31m60.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m00:01\u001B[0m00:01\u001B[0m\r\n", - "\u001B[?25hRequirement already satisfied: pandas in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (1.5.3)\r\n", - "Requirement already satisfied: numpy in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (1.24.2)\r\n", - "Requirement already satisfied: tqdm in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (4.65.0)\r\n", - "Collecting rapidjson\r\n", - " Downloading rapidjson-1.0.0-py3-none-any.whl (1.2 kB)\r\n", - "Collecting contourpy>=1.0.1\r\n", - " Downloading contourpy-1.0.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (299 kB)\r\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m300.0/300.0 kB\u001B[0m 
\u001B[31m42.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\r\n", - "\u001B[?25hCollecting cycler>=0.10\r\n", - " Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)\r\n", - "Collecting fonttools>=4.22.0\r\n", - " Downloading fonttools-4.39.3-py3-none-any.whl (1.0 MB)\r\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m1.0/1.0 MB\u001B[0m \u001B[31m69.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\r\n", - "\u001B[?25hCollecting kiwisolver>=1.0.1\r\n", - " Downloading kiwisolver-1.4.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)\r\n", - "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m1.4/1.4 MB\u001B[0m \u001B[31m63.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\r\n", - "\u001B[?25hRequirement already satisfied: packaging>=20.0 in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (from matplotlib) (23.0)\r\n", - "Requirement already satisfied: pillow>=6.2.0 in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (from matplotlib) (9.4.0)\r\n", - "Requirement already satisfied: pyparsing>=2.3.1 in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (from matplotlib) (3.0.9)\r\n", - "Requirement already satisfied: python-dateutil>=2.7 in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (from matplotlib) (2.8.2)\r\n", - "Requirement already satisfied: pytz>=2020.1 in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (from pandas) (2022.7.1)\r\n", - "Requirement already satisfied: six>=1.5 in /home/azalea/.conda/envs/311/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\r\n", - "Installing collected packages: rapidjson, kiwisolver, fonttools, cycler, contourpy, matplotlib, seaborn\r\n", - "Successfully installed contourpy-1.0.7 cycler-0.11.0 fonttools-4.39.3 kiwisolver-1.4.4 matplotlib-3.7.1 rapidjson-1.0.0 seaborn-0.12.2\r\n" - ] - } - ], - "source": [ - "!pip install seaborn matplotlib pandas numpy tqdm rapidjson" 
- ], - "metadata": { - "collapsed": false, - "ExecuteTime": { - "start_time": "2023-04-03T09:42:13.735175Z", - "end_time": "2023-04-03T09:42:28.796233Z" - } - } - }, { "cell_type": "code", "execution_count": 3, @@ -62,6 +10,7 @@ "import matplotlib\n", "import rapidjson as json\n", "import os\n", + "import toml\n", "from collections import Counter\n", "from pathlib import Path\n", "\n", @@ -140,8 +89,8 @@ " return json.loads(p.read_text())\n", "\n", "def load_data():\n", - " dir = Path('index-data')\n", - " files = [(dir / f) for f in tq(os.listdir(dir), 'Loading data') if f.endswith('.json')]\n", + " dir = Path(toml.loads(Path('config.toml').read_text())['index_path'])\n", + " files = [(dir / f) for f in tq(os.listdir(dir), 'Loading file list') if f.endswith('.json')]\n", " return pmap(_helper, files, desc='Loading json')\n", "\n", "\n", diff --git a/config.toml b/config.toml new file mode 100644 index 0000000..ea81c73 --- /dev/null +++ b/config.toml @@ -0,0 +1,2 @@ +# Moved the index data to .. to avoid IntelliJ being stuck on indexing it and processing file system changes +index_path = "../SuperbuyData/index-data" diff --git a/index_crawler.py b/index_crawler.py index 0a77dda..0e159d4 100644 --- a/index_crawler.py +++ b/index_crawler.py @@ -5,13 +5,13 @@ import json from pathlib import Path import requests - +import toml ses = requests.Session() ses.headers = {'accept-language': 'zh-CN'} -out_path = Path('index-data') +out_path = Path(toml.loads(Path('config.toml').read_text())['index_path']) out_path.mkdir(exist_ok=True) diff --git a/requirements.txt b/requirements.txt index dcb1579..aaac7a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,11 @@ hypy_utils uvicorn pysocks toml + +# Analysis +seaborn +matplotlib +pandas +numpy +tqdm +rapidjson