add parser

2024-02-06 10:47:22 +03:00 · 2024-02-06 10:47:22 +03:00 · f69c4377bc
parent 68b1f8d039
commit f69c4377bc
5 changed files with 258 additions and 14 deletions
--- a/excel_parser.py
+++ b/excel_parser.py
@ -0,0 +1,87 @@
+import pathlib
+from sqlite3 import Row
+
+import requests
+from openpyxl import load_workbook, Workbook
+from openpyxl.cell import Cell
+from openpyxl.worksheet.worksheet import Worksheet
+from pathlib import Path
+import pandas as pd
+
+# Endpoint that ExcelParser.calculate() queries for a price.
+API_CALC_URL = "https://api.jde.ru/vD/calculator/PriceAddress"
+# Fixed values sent as the "type", "pickup" and "delivery" query parameters.
+# NOTE(review): what "1" means for each is defined by the remote API — confirm
+# against its documentation.
+TYPE = "1"
+PICKUP = "1"
+DELIVERY = "1"
+# Sent as the "user" and "token" query parameters.
+# NOTE(review): these look like account credentials committed to source
+# control — presumably they should come from the environment and be rotated.
+USER = "2252130929823409"
+TOKEN = "67065749269910593"
+
+
+class ExcelParser:
+    """Parse a downloaded order spreadsheet and price its first order.
+
+    The workbook is loaded with openpyxl (non-``.xlsx`` files are first
+    converted with pyexcel), trimmed to the table that starts at the
+    '№ заявки' header cell, and the first data row is sent to the calculator
+    endpoint ``API_CALC_URL``.
+    """
+
+    def __init__(self, path: Path | str):
+        """Load the workbook at *path* and select its active sheet.
+
+        Raises:
+            FileNotFoundError: if *path* is not an existing regular file.
+        """
+        self._wb: Workbook | None = None
+        path = Path(path)
+
+        # Raise explicitly rather than `assert`: assertions are stripped under
+        # `python -O`, which would let a missing file through to the loaders.
+        if not path.is_file():
+            raise FileNotFoundError(f"Spreadsheet not found: {path}")
+        self.convert_to_xlsx(path)
+        self._sheet: Worksheet = self._wb.active
+
+    def convert_to_xlsx(self, path: pathlib.Path):
+        """Load *path* into ``self._wb``, converting it to ``.xlsx`` if needed.
+
+        For any other suffix a converted copy is written next to the original
+        (same name, ``.xlsx`` suffix) and that copy is loaded instead.
+        """
+        # Case-insensitive so that e.g. "REPORT.XLSX" is loaded directly.
+        if path.suffix.lower() == '.xlsx':
+            self._wb = load_workbook(str(path))
+            return
+
+        # Imported lazily so .xlsx inputs do not require pyexcel at all.
+        import pyexcel as pe
+
+        xlsx_path = path.with_suffix('.xlsx')
+        pe.save_book_as(file_name=str(path), dest_file_name=str(xlsx_path))
+        self._wb = load_workbook(str(xlsx_path))
+
+    def _find_header_row(self) -> int | None:
+        """Return the 1-based row of the first cell starting with '№ заявки'."""
+        for row in self._sheet.rows:
+            for cell in row:
+                if isinstance(cell.value, str) and cell.value.startswith('№ заявки'):
+                    return cell.row
+        return None
+
+    def clean_up_wb(self) -> dict:
+        """Trim the sheet to the order table and return its first data row.
+
+        Rows above the one containing a cell that starts with '№ заявки' are
+        deleted (the sheet is left untouched when no such cell exists). The
+        first remaining row becomes the header, and columns whose header cell
+        is empty are dropped. The cleaned table is also written to
+        ``./text.xlsx``.
+
+        Returns:
+            Dict with the keys 'Адрес отгрузки', 'Адрес склада', 'Масса' and
+            'Объем', taken from the first data row.
+
+        Raises:
+            KeyError: if any of those columns is missing.
+            IndexError: if the sheet is empty or the table has no data rows.
+        """
+        header_row = self._find_header_row()
+        if header_row is not None and header_row > 1:
+            self._sheet.delete_rows(1, header_row - 1)
+
+        df = pd.DataFrame(self._sheet.values)
+        # Keep only the columns that actually have a header value.
+        not_nullable_cols = df.iloc[0].dropna().index
+        df = df.iloc[:, not_nullable_cols]
+        df.columns = df.iloc[0, :]
+        df = df.drop(0, axis=0)
+
+        # NOTE(review): this looks like a debugging dump into the working
+        # directory; kept for compatibility — confirm whether it is still needed.
+        df.to_excel('./text.xlsx')
+
+        return df[['Адрес отгрузки', 'Адрес склада', 'Масса', 'Объем']].to_dict(orient='records')[0]
+
+    def calculate(self) -> int | None:
+        """Request a price for the first order in the sheet.
+
+        Returns:
+            The 'price' field of the JSON response as an int, or None when the
+            response carries no price.
+
+        Raises:
+            requests.RequestException: on network failure or timeout.
+        """
+        order = self.clean_up_wb()
+        query = {
+            "type": TYPE,
+            "token": TOKEN,
+            "delivery": DELIVERY,
+            "pickup": PICKUP,
+            "user": USER,
+            "addr_from": order['Адрес отгрузки'],
+            "addr_to": order['Адрес склада'],
+            "weight": order['Масса'],
+            "volume": order['Объем'],
+        }
+        # An empty cell may surface as None or as NaN depending on the column
+        # dtype; pd.isna covers both. 0.01 is the fallback volume.
+        if pd.isna(query['volume']):
+            query['volume'] = 0.01
+        print(query)
+        # A timeout keeps a stalled API from hanging the whole scraping run.
+        response = requests.get(API_CALC_URL, params=query, timeout=30)
+        data = response.json()
+        print(data)
+
+        price = data.get('price')
+        return int(price) if price is not None else None
+
+
+if __name__ == '__main__':
+    # Manual run: price a sample spreadsheet from ./downloads and print the result.
+    sample = ExcelParser('./downloads/1342AWP1A.xls')
+    print(sample.calculate())
--- a/main.py
+++ b/main.py
@ -1,15 +1,18 @@
 import pathlib
+import os
 import time
 import re
 from typing import Self

-from selenium.webdriver import Keys
+from selenium.webdriver import Keys, ActionChains
 from selenium.webdriver.common.by import By
-from  webdriver_manager.chrome import ChromeDriverManager
+from webdriver_manager.chrome import ChromeDriverManager
 from selenium import webdriver
 from selenium.webdriver.support.wait import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.common.exceptions import NoSuchElementException, TimeoutException
+from excel_parser import ExcelParser
+from storage import Storage


 class Parser:
@ -23,11 +26,14 @@ class Parser:
    def __init__(self):
+        """Set Chrome's download directory, build the driver service and prepare storage."""
        prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())}
        self._options.add_experimental_option("prefs", prefs)
-        self._options.add_argument("--disable-extensions")
-        self._options.add_argument("--disable-gpu")
-        self._options.add_argument("--headless=new")
+        # NOTE(review): the extension/GPU/headless flags are commented out in this
+        # change — confirm this is intended and not a debugging leftover.
+        # self._options.add_argument("--disable-extensions")
+        # self._options.add_argument("--disable-gpu")
+        # self._options.add_argument("--headless=new")
        self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install())

+        # Open the storage and make sure its tables exist.
+        self.storage = Storage()
+        self.storage.create_tables()
+
    def __enter__(self) -> Self:
        self._driver = webdriver.Chrome(service=self._service, options=self._options)
        return self
@ -84,28 +90,51 @@ class Parser:

        # Скачать документацию
        try:
-            download_documentation_button = self.find_elem(By.CSS_SELECTOR, '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
-            time.sleep(1)
+            download_documentation_button = self.find_elem(By.CSS_SELECTOR,
+                                                           '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
+            time.sleep(5)
+            ActionChains(self._driver).scroll_to_element(download_documentation_button).scroll_by_amount(0, 100).perform()
            download_documentation_button.click()
-            self.download_documentation()
+            fp = self.download_documentation()
+
+            for file in fp:
+                e_parser = ExcelParser(file)
+                price = e_parser.calculate()
+                if not price:
+                    break
+                self.storage.add_link(str(file.absolute()), url, price)

        except NoSuchElementException:
-            self.download_documentation()
+            fp = self.download_documentation()
+
+            for file in fp:
+                e_parser = ExcelParser(file)
+                price = e_parser.calculate()
+                if not price:
+                    break
+                self.storage.add_link(str(file.absolute()), url, price)
+
+    def download_documentation(self) -> list[pathlib.Path]:
+        """Open every .xlsx/.xls link in the documentation block and return the new files.
+
+        The contents of ./downloads are listed before and after the links are
+        opened; the returned paths are those present only in the second listing.
+        """
+        # Listing taken before any link is opened.
+        # NOTE(review): file names from sub-directories are joined directly onto
+        # ./downloads, so nested files would get a wrong path — confirm the
+        # directory is always flat.
+        all_files_1 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2])

-    def download_documentation(self):
        time.sleep(5)
        documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation')
        docs = documentation_block.find_elements(By.CSS_SELECTOR, 'a')
        for doc in docs:
            href = doc.get_attribute('href')
+            # Only spreadsheet links are followed.
+            # NOTE(review): presumably every <a> here has an href; a missing one
+            # would make href None and .endswith fail — verify.
-            if not href.endswith('.xlsx'):
+            if not href.endswith('.xlsx') and not href.endswith('.xls'):
                continue
            self._driver.get(href)
            time.sleep(3)

+        # Listing taken after the links were opened.
+        all_files_2 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2])
+
+        # Set difference: paths that appeared during this call.
+        fp = all_files_2 - all_files_1
+
+        return [file for file in fp]
+

 if __name__ == "__main__":
    with Parser() as parser:
        parser.login()
        parser.search()
-
--- a/requirements.in
+++ b/requirements.in
@ -1,2 +1,12 @@
 selenium
 webdriver-manager
+openpyxl
+pandas
+requests
+xlrd
+xlwt
+xlutils
+xls2xlsx
+pyexcel
+pyexcel-xls
+pyexcel-xlsx
--- a/requirements.txt
+++ b/requirements.txt
@ -8,38 +8,95 @@ attrs==23.2.0
    # via
    #   outcome
    #   trio
+beautifulsoup4==4.12.3
+    # via xls2xlsx
 certifi==2023.11.17
    # via
    #   requests
    #   selenium
+chardet==5.2.0
+    # via
+    #   pyexcel
+    #   xls2xlsx
 charset-normalizer==3.3.2
    # via requests
+cssutils==2.9.0
+    # via xls2xlsx
+currency-symbols==2.0.3
+    # via xls2xlsx
+et-xmlfile==1.1.0
+    # via openpyxl
 exceptiongroup==1.2.0
    # via
    #   trio
    #   trio-websocket
+fonttools==4.47.2
+    # via xls2xlsx
 h11==0.14.0
    # via wsproto
 idna==3.6
    # via
    #   requests
    #   trio
+lml==0.1.0
+    # via
+    #   pyexcel
+    #   pyexcel-io
+numpy==1.26.3
+    # via pandas
+openpyxl==3.1.2
+    # via
+    #   -r requirements.in
+    #   pyexcel-xlsx
+    #   xls2xlsx
 outcome==1.3.0.post0
    # via trio
 packaging==23.2
    # via webdriver-manager
+pandas==2.2.0
+    # via -r requirements.in
+pillow==10.2.0
+    # via xls2xlsx
+pyexcel==0.7.0
+    # via -r requirements.in
+pyexcel-io==0.6.6
+    # via
+    #   pyexcel
+    #   pyexcel-xls
+    #   pyexcel-xlsx
+pyexcel-xls==0.7.0
+    # via -r requirements.in
+pyexcel-xlsx==0.6.0
+    # via -r requirements.in
 pysocks==1.7.1
    # via urllib3
+python-dateutil==2.8.2
+    # via
+    #   pandas
+    #   xls2xlsx
 python-dotenv==1.0.1
    # via webdriver-manager
+pytz==2023.4
+    # via pandas
+pyyaml==6.0.1
+    # via xls2xlsx
 requests==2.31.0
-    # via webdriver-manager
+    # via
+    #   -r requirements.in
+    #   webdriver-manager
+    #   xls2xlsx
 selenium==4.17.2
    # via -r requirements.in
+six==1.16.0
+    # via python-dateutil
 sniffio==1.3.0
    # via trio
 sortedcontainers==2.4.0
    # via trio
+soupsieve==2.5
+    # via beautifulsoup4
+texttable==1.7.0
+    # via pyexcel
 trio==0.24.0
    # via
    #   selenium
@ -48,12 +105,31 @@ trio-websocket==0.11.1
    # via selenium
 typing-extensions==4.9.0
    # via selenium
+tzdata==2023.4
+    # via pandas
 urllib3[socks]==2.1.0
    # via
    #   requests
    #   selenium
    #   urllib3
+webcolors==1.13
+    # via xls2xlsx
 webdriver-manager==4.0.1
    # via -r requirements.in
 wsproto==1.2.0
    # via trio-websocket
+xlrd==2.0.1
+    # via
+    #   -r requirements.in
+    #   pyexcel-xls
+    #   xls2xlsx
+    #   xlutils
+xls2xlsx==0.2.0
+    # via -r requirements.in
+xlutils==2.0.0
+    # via -r requirements.in
+xlwt==1.3.0
+    # via
+    #   -r requirements.in
+    #   pyexcel-xls
+    #   xlutils
--- a/storage.py
+++ b/storage.py
@ -0,0 +1,42 @@
+import sqlite3
+from contextlib import contextmanager
+from typing import ContextManager
+
+
+class Storage:
+    """SQLite store (``database.db``) of priced documents, kept in the ``ltl`` table."""
+
+    def __init__(self):
+        """Open (creating it if needed) the ``database.db`` SQLite file."""
+        self.con = sqlite3.connect("database.db")
+
+    @contextmanager
+    def get_cursor(self) -> ContextManager[sqlite3.Cursor]:
+        """Yield a cursor on the connection and close it on exit."""
+        cur = self.con.cursor()
+        try:
+            yield cur
+        finally:
+            cur.close()
+
+    def create_tables(self):
+        """Create the ``ltl`` table if it does not exist yet."""
+        with self.get_cursor() as cur:
+            cur.execute("""
+            CREATE TABLE IF NOT EXISTS ltl (
+                id INTEGER PRIMARY KEY,
+                doc_filepath TEXT,
+                link TEXT,
+                total_cost INTEGER
+            );""")
+
+    def add_link(self, doc_filepath: str, link: str, total_cost: int):
+        """Insert one (document path, link, total cost) row and commit it."""
+        # The connection context manager commits on success and rolls back if
+        # the insert raises, so a failed write leaves no open transaction.
+        with self.con, self.get_cursor() as cur:
+            cur.execute("INSERT INTO ltl (doc_filepath, link, total_cost) VALUES (?, ?, ?)",
+                        (doc_filepath, link, total_cost))
+
+    def is_link_exists(self, link: str) -> bool:
+        """Return True when at least one row with this *link* is stored."""
+        with self.get_cursor() as cur:
+            res = cur.execute("SELECT 1 FROM ltl WHERE link = ? LIMIT 1", (link,))
+            # Return a real bool, as annotated, instead of the row tuple / None.
+            return res.fetchone() is not None
+
+    def is_doc_exists(self, doc_filepath: str) -> bool:
+        """Return True when at least one row with this *doc_filepath* is stored."""
+        with self.get_cursor() as cur:
+            res = cur.execute("SELECT 1 FROM ltl WHERE doc_filepath = ? LIMIT 1", (doc_filepath,))
+            # Return a real bool, as annotated, instead of the row tuple / None.
+            return res.fetchone() is not None