From f69c4377bc021451c7e7ffae95ad5dc967ae92ea Mon Sep 17 00:00:00 2001 From: Ernest Litvinenko Date: Tue, 6 Feb 2024 10:47:22 +0300 Subject: [PATCH] add parser --- excel_parser.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++++ main.py | 53 ++++++++++++++++++++++------- requirements.in | 12 ++++++- requirements.txt | 78 ++++++++++++++++++++++++++++++++++++++++++- storage.py | 42 +++++++++++++++++++++++ 5 files changed, 258 insertions(+), 14 deletions(-) create mode 100644 excel_parser.py create mode 100644 storage.py diff --git a/excel_parser.py b/excel_parser.py new file mode 100644 index 0000000..593dcee --- /dev/null +++ b/excel_parser.py @@ -0,0 +1,87 @@ +import pathlib +from sqlite3 import Row + +import requests +from openpyxl import load_workbook, Workbook +from openpyxl.cell import Cell +from openpyxl.worksheet.worksheet import Worksheet +from pathlib import Path +import pandas as pd + +API_CALC_URL = "https://api.jde.ru/vD/calculator/PriceAddress" +TYPE = "1" +PICKUP = "1" +DELIVERY = "1" +USER = "2252130929823409" +TOKEN = "67065749269910593" + + +class ExcelParser: + def __init__(self, path: Path | str): + self._wb: Workbook | None = None + if isinstance(path, str): + path = Path(path) + + assert path.is_file() is True + self.convert_to_xlsx(path) + self._sheet: Worksheet = self._wb.active + + def convert_to_xlsx(self, path: pathlib.Path): + import pyexcel as pe + if path.suffix == '.xlsx': + self._wb = load_workbook(str(path)) + return + pe.save_book_as(file_name=str(path), dest_file_name=str(path.with_suffix('.xlsx'))) + self._wb = load_workbook(str(path.with_suffix('.xlsx'))) + + def clean_up_wb(self) -> pd.DataFrame: + triggered = False + for row in self._sheet.rows: + for cell in row: + cell: Cell + row: Row + if isinstance(cell.value, str) and cell.value.startswith('№ заявки'): + self._sheet.delete_rows(1, cell.row - 1) + triggered = True + break + + if triggered: + break + + df = pd.DataFrame(self._sheet.values) + not_nullable_cols = df.iloc[0].dropna().index + df = df.iloc[:, not_nullable_cols] + df.columns = df.iloc[0, :] + df: pd.DataFrame = df.drop(0, axis=0) + + df.to_excel('./text.xlsx') + + return df[['Адрес отгрузки', 'Адрес склада', 'Масса', 'Объем']].to_dict(orient='records')[0] + + def calculate(self) -> int | None: + df = self.clean_up_wb() + query = { + "type": TYPE, + "token": TOKEN, + "delivery": DELIVERY, + "pickup": PICKUP, + "user": USER, + "addr_from": df['Адрес отгрузки'], + "addr_to": df['Адрес склада'], + "weight": df['Масса'], + "volume": df['Объем'], + } + if query['volume'] is None: + query['volume'] = 0.01 + print(query) + data = requests.get(API_CALC_URL, params=query).json() + print(data) + + if data.get('price', None) is not None: + return int(data['price']) + return None + + +if __name__ == '__main__': + parser = ExcelParser('./downloads/1342AWP1A.xls') + print(parser.calculate()) diff --git a/main.py b/main.py index efc0aa7..e3830e4 100644 --- a/main.py +++ b/main.py @@ -1,15 +1,18 @@ import pathlib +import os import time import re from typing import Self -from selenium.webdriver import Keys +from selenium.webdriver import Keys, ActionChains from selenium.webdriver.common.by import By -from webdriver_manager.chrome import ChromeDriverManager +from webdriver_manager.chrome import ChromeDriverManager from selenium import webdriver from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import NoSuchElementException, TimeoutException +from excel_parser import ExcelParser +from storage import Storage class Parser: @@ -23,11 +26,14 @@ class Parser: def __init__(self): prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())} self._options.add_experimental_option("prefs", prefs) - self._options.add_argument("--disable-extensions") - self._options.add_argument("--disable-gpu") - self._options.add_argument("--headless=new") + # self._options.add_argument("--disable-extensions") + # self._options.add_argument("--disable-gpu") + # self._options.add_argument("--headless=new") self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install()) + self.storage = Storage() + self.storage.create_tables() + def __enter__(self) -> Self: self._driver = webdriver.Chrome(service=self._service, options=self._options) return self @@ -84,28 +90,51 @@ class Parser: # Скачать документацию try: - download_documentation_button = self.find_elem(By.CSS_SELECTOR, '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]') - time.sleep(1) + download_documentation_button = self.find_elem(By.CSS_SELECTOR, + '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]') + time.sleep(5) + ActionChains(self._driver).scroll_to_element(download_documentation_button).scroll_by_amount(0, 100).perform() download_documentation_button.click() - self.download_documentation() + fp = self.download_documentation() + + for file in fp: + e_parser = ExcelParser(file) + price = e_parser.calculate() + if not price: + break + self.storage.add_link(str(file.absolute()), url, price) except NoSuchElementException: - self.download_documentation() + fp = self.download_documentation() + + for file in fp: + e_parser = ExcelParser(file) + price = e_parser.calculate() + if not price: + break + self.storage.add_link(str(file.absolute()), url, price) + + def download_documentation(self) -> list[pathlib.Path]: + all_files_1 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2]) - def download_documentation(self): time.sleep(5) documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation') docs = documentation_block.find_elements(By.CSS_SELECTOR, 'a') for doc in docs: href = doc.get_attribute('href') - if not href.endswith('.xlsx'): + if not href.endswith('.xlsx') and not href.endswith('.xls'): continue self._driver.get(href) time.sleep(3) + all_files_2 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2]) + + fp = all_files_2 - all_files_1 + + return [file for file in fp] + if __name__ == "__main__": with Parser() as parser: parser.login() parser.search() - diff --git a/requirements.in b/requirements.in index dcbe6a6..b2966c9 100644 --- a/requirements.in +++ b/requirements.in @@ -1,2 +1,12 @@ selenium -webdriver-manager \ No newline at end of file +webdriver-manager +openpyxl +pandas +requests +xlrd +xlwt +xlutils +xls2xlsx +pyexcel +pyexcel-xls +pyexcel-xlsx \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index cf0cbb8..b92c535 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,38 +8,95 @@ attrs==23.2.0 # via # outcome # trio +beautifulsoup4==4.12.3 + # via xls2xlsx certifi==2023.11.17 # via # requests # selenium +chardet==5.2.0 + # via + # pyexcel + # xls2xlsx charset-normalizer==3.3.2 # via requests +cssutils==2.9.0 + # via xls2xlsx +currency-symbols==2.0.3 + # via xls2xlsx +et-xmlfile==1.1.0 + # via openpyxl exceptiongroup==1.2.0 # via # trio # trio-websocket +fonttools==4.47.2 + # via xls2xlsx h11==0.14.0 # via wsproto idna==3.6 # via # requests # trio +lml==0.1.0 + # via + # pyexcel + # pyexcel-io +numpy==1.26.3 + # via pandas +openpyxl==3.1.2 + # via + # -r requirements.in + # pyexcel-xlsx + # xls2xlsx outcome==1.3.0.post0 # via trio packaging==23.2 # via webdriver-manager +pandas==2.2.0 + # via -r requirements.in +pillow==10.2.0 + # via xls2xlsx +pyexcel==0.7.0 + # via -r requirements.in +pyexcel-io==0.6.6 + # via + # pyexcel + # pyexcel-xls + # pyexcel-xlsx +pyexcel-xls==0.7.0 + # via -r requirements.in +pyexcel-xlsx==0.6.0 + # via -r requirements.in pysocks==1.7.1 # via urllib3 +python-dateutil==2.8.2 + # via + # pandas + # xls2xlsx python-dotenv==1.0.1 # via webdriver-manager +pytz==2023.4 + # via pandas +pyyaml==6.0.1 + # via xls2xlsx requests==2.31.0 - # via webdriver-manager + # via + # -r requirements.in + # webdriver-manager + # xls2xlsx selenium==4.17.2 # via -r requirements.in +six==1.16.0 + # via python-dateutil sniffio==1.3.0 # via trio sortedcontainers==2.4.0 # via trio +soupsieve==2.5 + # via beautifulsoup4 +texttable==1.7.0 + # via pyexcel trio==0.24.0 # via # selenium @@ -48,12 +105,31 @@ trio-websocket==0.11.1 # via selenium typing-extensions==4.9.0 # via selenium +tzdata==2023.4 + # via pandas urllib3[socks]==2.1.0 # via # requests # selenium # urllib3 +webcolors==1.13 + # via xls2xlsx webdriver-manager==4.0.1 # via -r requirements.in wsproto==1.2.0 # via trio-websocket +xlrd==2.0.1 + # via + # -r requirements.in + # pyexcel-xls + # xls2xlsx + # xlutils +xls2xlsx==0.2.0 + # via -r requirements.in +xlutils==2.0.0 + # via -r requirements.in +xlwt==1.3.0 + # via + # -r requirements.in + # pyexcel-xls + # xlutils diff --git a/storage.py b/storage.py new file mode 100644 index 0000000..25e306c --- /dev/null +++ b/storage.py @@ -0,0 +1,42 @@ +import sqlite3 +from contextlib import contextmanager +from typing import ContextManager + + +class Storage: + def __init__(self): + self.con = sqlite3.connect("database.db") + + @contextmanager + def get_cursor(self) -> ContextManager[sqlite3.Cursor]: + cur = self.con.cursor() + try: + yield cur + finally: + cur.close() + + def create_tables(self): + with self.get_cursor() as cur: + cur.execute(""" + CREATE TABLE IF NOT EXISTS ltl ( + id INTEGER PRIMARY KEY, + doc_filepath TEXT, + link TEXT, + total_cost INTEGER + );""") + + def add_link(self, doc_filepath: str, link: str, total_cost: int): + with self.get_cursor() as cur: + cur.execute("INSERT INTO ltl (doc_filepath, link, total_cost) VALUES (?, ?, ?)", + (doc_filepath, link, total_cost)) + self.con.commit() + + def is_link_exists(self, link: str) -> bool: + with self.get_cursor() as cur: + res = cur.execute("SELECT * FROM ltl WHERE link = ?", (link,)) + return res.fetchone() + + def is_doc_exists(self, doc_filepath: str) -> bool: + with self.get_cursor() as cur: + res = cur.execute("SELECT * FROM ltl WHERE doc_filepath = ?", (doc_filepath,)) + return res.fetchone()