add parser

2024-02-06 10:47:22 +03:00 · 2024-02-06 10:47:22 +03:00 · f69c4377bc
parent 68b1f8d039
commit f69c4377bc
5 changed files with 258 additions and 14 deletions
--- a/excel_parser.py
+++ b/excel_parser.py
@ -0,0 +1,87 @@
 import pathlib
 from sqlite3 import Row
 import requests
 from openpyxl import load_workbook, Workbook
 from openpyxl.cell import Cell
 from openpyxl.worksheet.worksheet import Worksheet
 from pathlib import Path
 import pandas as pd
 import os
 # Endpoint of the price calculator queried by ExcelParser.calculate().
 API_CALC_URL = "https://api.jde.ru/vD/calculator/PriceAddress"
 # Fixed request parameters, sent as the "type", "pickup" and "delivery" query fields.
 TYPE = "1"
 PICKUP = "1"
 DELIVERY = "1"
 # API credentials, sent as the "user" and "token" query fields.
 # They can be overridden through the environment so that deployments do not
 # have to rely on values committed to the repository; the previous literals
 # remain as defaults to keep existing setups working.
 # NOTE(review): rotate these credentials and drop the defaults once every
 # environment provides JDE_API_USER / JDE_API_TOKEN.
 USER = os.environ.get("JDE_API_USER", "2252130929823409")
 TOKEN = os.environ.get("JDE_API_TOKEN", "67065749269910593")
 class ExcelParser:
    def __init__(self, path: Path | str):
        self._wb: Workbook | None = None
        if isinstance(path, str):
            path = Path(path)
        assert path.is_file() is True
        self.convert_to_xlsx(path)
        self._sheet: Worksheet = self._wb.active
    def convert_to_xlsx(self, path: pathlib.Path):
        import pyexcel as pe
        if path.suffix == '.xlsx':
            self._wb = load_workbook(str(path))
            return
        pe.save_book_as(file_name=str(path), dest_file_name=str(path.with_suffix('.xlsx')))
        self._wb = load_workbook(str(path.with_suffix('.xlsx')))
    def clean_up_wb(self) -> pd.DataFrame:
        triggered = False
        for row in self._sheet.rows:
            for cell in row:
                cell: Cell
                row: Row
                if isinstance(cell.value, str) and cell.value.startswith('№ заявки'):
                    self._sheet.delete_rows(1, cell.row - 1)
                    triggered = True
                    break
            if triggered:
                break
        df = pd.DataFrame(self._sheet.values)
        not_nullable_cols = df.iloc[0].dropna().index
        df = df.iloc[:, not_nullable_cols]
        df.columns = df.iloc[0, :]
        df: pd.DataFrame = df.drop(0, axis=0)
        df.to_excel('./text.xlsx')
        return df[['Адрес отгрузки', 'Адрес склада', 'Масса', 'Объем']].to_dict(orient='records')[0]
    def calculate(self) -> int | None:
        df = self.clean_up_wb()
        query = {
            "type": TYPE,
            "token": TOKEN,
            "delivery": DELIVERY,
            "pickup": PICKUP,
            "user": USER,
            "addr_from": df['Адрес отгрузки'],
            "addr_to": df['Адрес склада'],
            "weight": df['Масса'],
            "volume": df['Объем'],
        }
        if query['volume'] is None:
            query['volume'] = 0.01
        print(query)
        data = requests.get(API_CALC_URL, params=query).json()
        print(data)
        if data.get('price', None) is not None:
            return int(data['price'])
        return None
 if __name__ == '__main__':
    # Manual run: price one sample spreadsheet and print the result.
    sample_parser = ExcelParser('./downloads/1342AWP1A.xls')
    quoted_price = sample_parser.calculate()
    print(quoted_price)
--- a/main.py
+++ b/main.py
@ -1,15 +1,18 @@
 import pathlib
 import os
 import time
 import re
 from typing import Self
-from selenium.webdriver import Keys
+from selenium.webdriver import Keys, ActionChains
 from selenium.webdriver.common.by import By
 from webdriver_manager.chrome import ChromeDriverManager
 from selenium import webdriver
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import NoSuchElementException, TimeoutException
 from excel_parser import ExcelParser
 from storage import Storage
 class Parser:
@ -23,11 +26,14 @@ class Parser:
    def __init__(self):
        # Prepare the Chrome options, the driver service and the storage.
        # The browser itself is not started here; __enter__ creates the driver.
        # NOTE(review): self._options is not assigned in this method -- presumably
        # a class-level options object defined above; confirm.
        # Make Chrome save downloads into the absolute path of ./downloads.
        prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())}
        self._options.add_experimental_option("prefs", prefs)
-        self._options.add_argument("--disable-extensions")
+        # self._options.add_argument("--disable-extensions")
-        self._options.add_argument("--disable-gpu")
+        # self._options.add_argument("--disable-gpu")
-        self._options.add_argument("--headless=new")
+        # self._options.add_argument("--headless=new")
        # Resolve the chromedriver executable through ChromeDriverManager.
        self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install())
        # Open the storage and create its tables.
        self.storage = Storage()
        self.storage.create_tables()
    def __enter__(self) -> Self:
        """Start the Chrome driver and return this parser for use in a ``with`` block."""
        chrome = webdriver.Chrome(options=self._options, service=self._service)
        self._driver = chrome
        return self
@ -84,28 +90,51 @@ class Parser:
        # Скачать документацию
        try:
-            download_documentation_button = self.find_elem(By.CSS_SELECTOR, '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
+            download_documentation_button = self.find_elem(By.CSS_SELECTOR,
-            time.sleep(1)
+                                                           '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
            time.sleep(5)
            ActionChains(self._driver).scroll_to_element(download_documentation_button).scroll_by_amount(0, 100).perform()
            download_documentation_button.click()
-            self.download_documentation()
+            fp = self.download_documentation()
            for file in fp:
                e_parser = ExcelParser(file)
                price = e_parser.calculate()
                if not price:
                    break
                self.storage.add_link(str(file.absolute()), url, price)
        except NoSuchElementException:
-            self.download_documentation()
+            fp = self.download_documentation()
            for file in fp:
                e_parser = ExcelParser(file)
                price = e_parser.calculate()
                if not price:
                    break
                self.storage.add_link(str(file.absolute()), url, price)
    def download_documentation(self) -> list[pathlib.Path]:
        # Open every .xlsx/.xls link of the documentation block in the browser and
        # return the paths that appeared under ./downloads while doing so.
        # Snapshot of the files under ./downloads before anything is opened.
        # NOTE(review): only the bare file name from os.walk is used, so a file in a
        # subdirectory is recorded as ./downloads/<name> -- confirm this is intended.
        all_files_1 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2])
    def download_documentation(self):
        # NOTE(review): this second ``def`` line looks like the pre-change signature
        # left in by the diff rendering -- confirm against the actual file.
        # Fixed wait before looking up the documentation block.
        time.sleep(5)
        documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation')
        docs = documentation_block.find_elements(By.CSS_SELECTOR, 'a')
        for doc in docs:
            href = doc.get_attribute('href')
            # Skip links that do not end in a spreadsheet extension.
-            if not href.endswith('.xlsx'):
+            if not href.endswith('.xlsx') and not href.endswith('.xls'):
                continue
            # Navigate to the file link; presumably this triggers the download into
            # the configured download directory -- confirm.
            self._driver.get(href)
            # NOTE(review): fixed 3 s wait; a slow download may not have finished
            # when the directory is scanned again below -- confirm.
            time.sleep(3)
        # Second snapshot; the difference is the set of newly appeared files.
        all_files_2 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2])
        fp = all_files_2 - all_files_1
        return [file for file in fp]
 if __name__ == "__main__":
    # Script entry point: open a browser session, log in, then run the search.
    with Parser() as scraper:
        scraper.login()
        scraper.search()
--- a/requirements.in
+++ b/requirements.in
@ -1,2 +1,12 @@
 selenium
 webdriver-manager
 openpyxl
 pandas
 requests
 xlrd
 xlwt
 xlutils
 xls2xlsx
 pyexcel
 pyexcel-xls
 pyexcel-xlsx
--- a/requirements.txt
+++ b/requirements.txt
@ -8,38 +8,95 @@ attrs==23.2.0
    # via
    #   outcome
    #   trio
 beautifulsoup4==4.12.3
    # via xls2xlsx
 certifi==2023.11.17
    # via
    #   requests
    #   selenium
 chardet==5.2.0
    # via
    #   pyexcel
    #   xls2xlsx
 charset-normalizer==3.3.2
    # via requests
 cssutils==2.9.0
    # via xls2xlsx
 currency-symbols==2.0.3
    # via xls2xlsx
 et-xmlfile==1.1.0
    # via openpyxl
 exceptiongroup==1.2.0
    # via
    #   trio
    #   trio-websocket
 fonttools==4.47.2
    # via xls2xlsx
 h11==0.14.0
    # via wsproto
 idna==3.6
    # via
    #   requests
    #   trio
 lml==0.1.0
    # via
    #   pyexcel
    #   pyexcel-io
 numpy==1.26.3
    # via pandas
 openpyxl==3.1.2
    # via
    #   -r requirements.in
    #   pyexcel-xlsx
    #   xls2xlsx
 outcome==1.3.0.post0
    # via trio
 packaging==23.2
    # via webdriver-manager
 pandas==2.2.0
    # via -r requirements.in
 pillow==10.2.0
    # via xls2xlsx
 pyexcel==0.7.0
    # via -r requirements.in
 pyexcel-io==0.6.6
    # via
    #   pyexcel
    #   pyexcel-xls
    #   pyexcel-xlsx
 pyexcel-xls==0.7.0
    # via -r requirements.in
 pyexcel-xlsx==0.6.0
    # via -r requirements.in
 pysocks==1.7.1
    # via urllib3
 python-dateutil==2.8.2
    # via
    #   pandas
    #   xls2xlsx
 python-dotenv==1.0.1
    # via webdriver-manager
 pytz==2023.4
    # via pandas
 pyyaml==6.0.1
    # via xls2xlsx
 requests==2.31.0
-    # via webdriver-manager
+    # via
    #   -r requirements.in
    #   webdriver-manager
    #   xls2xlsx
 selenium==4.17.2
    # via -r requirements.in
 six==1.16.0
    # via python-dateutil
 sniffio==1.3.0
    # via trio
 sortedcontainers==2.4.0
    # via trio
 soupsieve==2.5
    # via beautifulsoup4
 texttable==1.7.0
    # via pyexcel
 trio==0.24.0
    # via
    #   selenium
@ -48,12 +105,31 @@ trio-websocket==0.11.1
    # via selenium
 typing-extensions==4.9.0
    # via selenium
 tzdata==2023.4
    # via pandas
 urllib3[socks]==2.1.0
    # via
    #   requests
    #   selenium
    #   urllib3
 webcolors==1.13
    # via xls2xlsx
 webdriver-manager==4.0.1
    # via -r requirements.in
 wsproto==1.2.0
    # via trio-websocket
 xlrd==2.0.1
    # via
    #   -r requirements.in
    #   pyexcel-xls
    #   xls2xlsx
    #   xlutils
 xls2xlsx==0.2.0
    # via -r requirements.in
 xlutils==2.0.0
    # via -r requirements.in
 xlwt==1.3.0
    # via
    #   -r requirements.in
    #   pyexcel-xls
    #   xlutils
--- a/storage.py
+++ b/storage.py
@ -0,0 +1,42 @@
 import sqlite3
 from contextlib import contextmanager
 from typing import ContextManager
 class Storage:
    """SQLite-backed store of (doc_filepath, link, total_cost) records in table ``ltl``."""

    def __init__(self, db_path: str = "database.db"):
        """Open (or create) the SQLite database.

        Args:
            db_path: database file to use; defaults to ``database.db`` in the
                current directory. ``":memory:"`` gives a throwaway database.
        """
        self.con = sqlite3.connect(db_path)

    @contextmanager
    def get_cursor(self) -> ContextManager[sqlite3.Cursor]:
        """Yield a cursor on the shared connection and close it on exit."""
        cur = self.con.cursor()
        try:
            yield cur
        finally:
            cur.close()

    def close(self) -> None:
        """Close the underlying database connection."""
        self.con.close()

    def create_tables(self):
        """Create the ``ltl`` table if it does not exist yet."""
        with self.get_cursor() as cur:
            cur.execute("""
            CREATE TABLE IF NOT EXISTS ltl (
                id INTEGER PRIMARY KEY,
                doc_filepath TEXT,
                link TEXT,
                total_cost INTEGER
            );""")

    def add_link(self, doc_filepath: str, link: str, total_cost: int):
        """Insert one record and commit it."""
        with self.get_cursor() as cur:
            cur.execute("INSERT INTO ltl (doc_filepath, link, total_cost) VALUES (?, ?, ?)",
                        (doc_filepath, link, total_cost))
            self.con.commit()

    def is_link_exists(self, link: str) -> bool:
        """Return True if a record with this *link* is stored."""
        with self.get_cursor() as cur:
            res = cur.execute("SELECT 1 FROM ltl WHERE link = ? LIMIT 1", (link,))
            # Convert the row/None result into the bool the signature promises.
            return res.fetchone() is not None

    def is_doc_exists(self, doc_filepath: str) -> bool:
        """Return True if a record with this *doc_filepath* is stored."""
        with self.get_cursor() as cur:
            res = cur.execute("SELECT 1 FROM ltl WHERE doc_filepath = ? LIMIT 1", (doc_filepath,))
            return res.fetchone() is not None