add parser

master
Ernest Litvinenko 2024-02-06 10:47:22 +03:00
parent 68b1f8d039
commit f69c4377bc
5 changed files with 258 additions and 14 deletions

87
excel_parser.py Normal file
View File

@ -0,0 +1,87 @@
import pathlib
from sqlite3 import Row
import requests
from openpyxl import load_workbook, Workbook
from openpyxl.cell import Cell
from openpyxl.worksheet.worksheet import Worksheet
from pathlib import Path
import pandas as pd
API_CALC_URL = "https://api.jde.ru/vD/calculator/PriceAddress"
TYPE = "1"
PICKUP = "1"
DELIVERY = "1"
USER = "2252130929823409"
TOKEN = "67065749269910593"
class ExcelParser:
def __init__(self, path: Path | str):
self._wb: Workbook | None = None
if isinstance(path, str):
path = Path(path)
assert path.is_file() is True
self.convert_to_xlsx(path)
self._sheet: Worksheet = self._wb.active
def convert_to_xlsx(self, path: pathlib.Path):
import pyexcel as pe
if path.suffix == '.xlsx':
self._wb = load_workbook(str(path))
return
pe.save_book_as(file_name=str(path), dest_file_name=str(path.with_suffix('.xlsx')))
self._wb = load_workbook(str(path.with_suffix('.xlsx')))
def clean_up_wb(self) -> pd.DataFrame:
triggered = False
for row in self._sheet.rows:
for cell in row:
cell: Cell
row: Row
if isinstance(cell.value, str) and cell.value.startswith('№ заявки'):
self._sheet.delete_rows(1, cell.row - 1)
triggered = True
break
if triggered:
break
df = pd.DataFrame(self._sheet.values)
not_nullable_cols = df.iloc[0].dropna().index
df = df.iloc[:, not_nullable_cols]
df.columns = df.iloc[0, :]
df: pd.DataFrame = df.drop(0, axis=0)
df.to_excel('./text.xlsx')
return df[['Адрес отгрузки', 'Адрес склада', 'Масса', 'Объем']].to_dict(orient='records')[0]
def calculate(self) -> int | None:
df = self.clean_up_wb()
query = {
"type": TYPE,
"token": TOKEN,
"delivery": DELIVERY,
"pickup": PICKUP,
"user": USER,
"addr_from": df['Адрес отгрузки'],
"addr_to": df['Адрес склада'],
"weight": df['Масса'],
"volume": df['Объем'],
}
if query['volume'] is None:
query['volume'] = 0.01
print(query)
data = requests.get(API_CALC_URL, params=query).json()
print(data)
if data.get('price', None) is not None:
return int(data['price'])
return None
if __name__ == '__main__':
parser = ExcelParser('./downloads/1342AWP1A.xls')
print(parser.calculate())

51
main.py
View File

@ -1,15 +1,18 @@
import pathlib import pathlib
import os
import time import time
import re import re
from typing import Self from typing import Self
from selenium.webdriver import Keys from selenium.webdriver import Keys, ActionChains
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException from selenium.common.exceptions import NoSuchElementException, TimeoutException
from excel_parser import ExcelParser
from storage import Storage
class Parser: class Parser:
@ -23,11 +26,14 @@ class Parser:
def __init__(self): def __init__(self):
prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())} prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())}
self._options.add_experimental_option("prefs", prefs) self._options.add_experimental_option("prefs", prefs)
self._options.add_argument("--disable-extensions") # self._options.add_argument("--disable-extensions")
self._options.add_argument("--disable-gpu") # self._options.add_argument("--disable-gpu")
self._options.add_argument("--headless=new") # self._options.add_argument("--headless=new")
self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install()) self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install())
self.storage = Storage()
self.storage.create_tables()
def __enter__(self) -> Self: def __enter__(self) -> Self:
self._driver = webdriver.Chrome(service=self._service, options=self._options) self._driver = webdriver.Chrome(service=self._service, options=self._options)
return self return self
@ -84,28 +90,51 @@ class Parser:
# Скачать документацию # Скачать документацию
try: try:
download_documentation_button = self.find_elem(By.CSS_SELECTOR, '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]') download_documentation_button = self.find_elem(By.CSS_SELECTOR,
time.sleep(1) '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
time.sleep(5)
ActionChains(self._driver).scroll_to_element(download_documentation_button).scroll_by_amount(0, 100).perform()
download_documentation_button.click() download_documentation_button.click()
self.download_documentation() fp = self.download_documentation()
for file in fp:
e_parser = ExcelParser(file)
price = e_parser.calculate()
if not price:
break
self.storage.add_link(str(file.absolute()), url, price)
except NoSuchElementException: except NoSuchElementException:
self.download_documentation() fp = self.download_documentation()
for file in fp:
e_parser = ExcelParser(file)
price = e_parser.calculate()
if not price:
break
self.storage.add_link(str(file.absolute()), url, price)
def download_documentation(self) -> list[pathlib.Path]:
all_files_1 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2])
def download_documentation(self):
time.sleep(5) time.sleep(5)
documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation') documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation')
docs = documentation_block.find_elements(By.CSS_SELECTOR, 'a') docs = documentation_block.find_elements(By.CSS_SELECTOR, 'a')
for doc in docs: for doc in docs:
href = doc.get_attribute('href') href = doc.get_attribute('href')
if not href.endswith('.xlsx'): if not href.endswith('.xlsx') and not href.endswith('.xls'):
continue continue
self._driver.get(href) self._driver.get(href)
time.sleep(3) time.sleep(3)
all_files_2 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2])
fp = all_files_2 - all_files_1
return [file for file in fp]
if __name__ == "__main__": if __name__ == "__main__":
with Parser() as parser: with Parser() as parser:
parser.login() parser.login()
parser.search() parser.search()

View File

@ -1,2 +1,12 @@
selenium selenium
webdriver-manager webdriver-manager
openpyxl
pandas
requests
xlrd
xlwt
xlutils
xls2xlsx
pyexcel
pyexcel-xls
pyexcel-xlsx

View File

@ -8,38 +8,95 @@ attrs==23.2.0
# via # via
# outcome # outcome
# trio # trio
beautifulsoup4==4.12.3
# via xls2xlsx
certifi==2023.11.17 certifi==2023.11.17
# via # via
# requests # requests
# selenium # selenium
chardet==5.2.0
# via
# pyexcel
# xls2xlsx
charset-normalizer==3.3.2 charset-normalizer==3.3.2
# via requests # via requests
cssutils==2.9.0
# via xls2xlsx
currency-symbols==2.0.3
# via xls2xlsx
et-xmlfile==1.1.0
# via openpyxl
exceptiongroup==1.2.0 exceptiongroup==1.2.0
# via # via
# trio # trio
# trio-websocket # trio-websocket
fonttools==4.47.2
# via xls2xlsx
h11==0.14.0 h11==0.14.0
# via wsproto # via wsproto
idna==3.6 idna==3.6
# via # via
# requests # requests
# trio # trio
lml==0.1.0
# via
# pyexcel
# pyexcel-io
numpy==1.26.3
# via pandas
openpyxl==3.1.2
# via
# -r requirements.in
# pyexcel-xlsx
# xls2xlsx
outcome==1.3.0.post0 outcome==1.3.0.post0
# via trio # via trio
packaging==23.2 packaging==23.2
# via webdriver-manager # via webdriver-manager
pandas==2.2.0
# via -r requirements.in
pillow==10.2.0
# via xls2xlsx
pyexcel==0.7.0
# via -r requirements.in
pyexcel-io==0.6.6
# via
# pyexcel
# pyexcel-xls
# pyexcel-xlsx
pyexcel-xls==0.7.0
# via -r requirements.in
pyexcel-xlsx==0.6.0
# via -r requirements.in
pysocks==1.7.1 pysocks==1.7.1
# via urllib3 # via urllib3
python-dateutil==2.8.2
# via
# pandas
# xls2xlsx
python-dotenv==1.0.1 python-dotenv==1.0.1
# via webdriver-manager # via webdriver-manager
pytz==2023.4
# via pandas
pyyaml==6.0.1
# via xls2xlsx
requests==2.31.0 requests==2.31.0
# via webdriver-manager # via
# -r requirements.in
# webdriver-manager
# xls2xlsx
selenium==4.17.2 selenium==4.17.2
# via -r requirements.in # via -r requirements.in
six==1.16.0
# via python-dateutil
sniffio==1.3.0 sniffio==1.3.0
# via trio # via trio
sortedcontainers==2.4.0 sortedcontainers==2.4.0
# via trio # via trio
soupsieve==2.5
# via beautifulsoup4
texttable==1.7.0
# via pyexcel
trio==0.24.0 trio==0.24.0
# via # via
# selenium # selenium
@ -48,12 +105,31 @@ trio-websocket==0.11.1
# via selenium # via selenium
typing-extensions==4.9.0 typing-extensions==4.9.0
# via selenium # via selenium
tzdata==2023.4
# via pandas
urllib3[socks]==2.1.0 urllib3[socks]==2.1.0
# via # via
# requests # requests
# selenium # selenium
# urllib3 # urllib3
webcolors==1.13
# via xls2xlsx
webdriver-manager==4.0.1 webdriver-manager==4.0.1
# via -r requirements.in # via -r requirements.in
wsproto==1.2.0 wsproto==1.2.0
# via trio-websocket # via trio-websocket
xlrd==2.0.1
# via
# -r requirements.in
# pyexcel-xls
# xls2xlsx
# xlutils
xls2xlsx==0.2.0
# via -r requirements.in
xlutils==2.0.0
# via -r requirements.in
xlwt==1.3.0
# via
# -r requirements.in
# pyexcel-xls
# xlutils

42
storage.py Normal file
View File

@ -0,0 +1,42 @@
import sqlite3
from contextlib import contextmanager
from typing import ContextManager
class Storage:
def __init__(self):
self.con = sqlite3.connect("database.db")
@contextmanager
def get_cursor(self) -> ContextManager[sqlite3.Cursor]:
cur = self.con.cursor()
try:
yield cur
finally:
cur.close()
def create_tables(self):
with self.get_cursor() as cur:
cur.execute("""
CREATE TABLE IF NOT EXISTS ltl (
id INTEGER PRIMARY KEY,
doc_filepath TEXT,
link TEXT,
total_cost INTEGER
);""")
def add_link(self, doc_filepath: str, link: str, total_cost: int):
with self.get_cursor() as cur:
cur.execute("INSERT INTO ltl (doc_filepath, link, total_cost) VALUES (?, ?, ?)",
(doc_filepath, link, total_cost))
self.con.commit()
def is_link_exists(self, link: str) -> bool:
with self.get_cursor() as cur:
res = cur.execute("SELECT * FROM ltl WHERE link = ?", (link,))
return res.fetchone()
def is_doc_exists(self, doc_filepath: str) -> bool:
with self.get_cursor() as cur:
res = cur.execute("SELECT * FROM ltl WHERE doc_filepath = ?", (doc_filepath,))
return res.fetchone()