add parser
parent
68b1f8d039
commit
f69c4377bc
|
@ -0,0 +1,87 @@
|
||||||
|
import pathlib
|
||||||
|
from sqlite3 import Row
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from openpyxl import load_workbook, Workbook
|
||||||
|
from openpyxl.cell import Cell
|
||||||
|
from openpyxl.worksheet.worksheet import Worksheet
|
||||||
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import os

# Price-calculator endpoint queried by ExcelParser.calculate().
API_CALC_URL = "https://api.jde.ru/vD/calculator/PriceAddress"

# Fixed request parameters sent with every calculator call.
# NOTE(review): the meaning of the "1" codes is not visible in this file --
# confirm against the calculator API documentation before changing them.
TYPE = "1"
PICKUP = "1"
DELIVERY = "1"

# API credentials. They can be overridden through the environment so that
# deployments do not need to edit the source; the literals are kept only as
# backward-compatible defaults.
# NOTE(review): these values are committed to version control -- rotate them
# and drop the defaults once every environment provides the variables.
USER = os.environ.get("JDE_API_USER", "2252130929823409")
TOKEN = os.environ.get("JDE_API_TOKEN", "67065749269910593")
|
||||||
|
|
||||||
|
|
||||||
|
class ExcelParser:
|
||||||
|
def __init__(self, path: Path | str):
|
||||||
|
self._wb: Workbook | None = None
|
||||||
|
if isinstance(path, str):
|
||||||
|
path = Path(path)
|
||||||
|
|
||||||
|
assert path.is_file() is True
|
||||||
|
self.convert_to_xlsx(path)
|
||||||
|
self._sheet: Worksheet = self._wb.active
|
||||||
|
|
||||||
|
def convert_to_xlsx(self, path: pathlib.Path):
|
||||||
|
import pyexcel as pe
|
||||||
|
if path.suffix == '.xlsx':
|
||||||
|
self._wb = load_workbook(str(path))
|
||||||
|
return
|
||||||
|
pe.save_book_as(file_name=str(path), dest_file_name=str(path.with_suffix('.xlsx')))
|
||||||
|
self._wb = load_workbook(str(path.with_suffix('.xlsx')))
|
||||||
|
|
||||||
|
def clean_up_wb(self) -> pd.DataFrame:
|
||||||
|
triggered = False
|
||||||
|
for row in self._sheet.rows:
|
||||||
|
for cell in row:
|
||||||
|
cell: Cell
|
||||||
|
row: Row
|
||||||
|
if isinstance(cell.value, str) and cell.value.startswith('№ заявки'):
|
||||||
|
self._sheet.delete_rows(1, cell.row - 1)
|
||||||
|
triggered = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if triggered:
|
||||||
|
break
|
||||||
|
|
||||||
|
df = pd.DataFrame(self._sheet.values)
|
||||||
|
not_nullable_cols = df.iloc[0].dropna().index
|
||||||
|
df = df.iloc[:, not_nullable_cols]
|
||||||
|
df.columns = df.iloc[0, :]
|
||||||
|
df: pd.DataFrame = df.drop(0, axis=0)
|
||||||
|
|
||||||
|
df.to_excel('./text.xlsx')
|
||||||
|
|
||||||
|
return df[['Адрес отгрузки', 'Адрес склада', 'Масса', 'Объем']].to_dict(orient='records')[0]
|
||||||
|
|
||||||
|
def calculate(self) -> int | None:
|
||||||
|
df = self.clean_up_wb()
|
||||||
|
query = {
|
||||||
|
"type": TYPE,
|
||||||
|
"token": TOKEN,
|
||||||
|
"delivery": DELIVERY,
|
||||||
|
"pickup": PICKUP,
|
||||||
|
"user": USER,
|
||||||
|
"addr_from": df['Адрес отгрузки'],
|
||||||
|
"addr_to": df['Адрес склада'],
|
||||||
|
"weight": df['Масса'],
|
||||||
|
"volume": df['Объем'],
|
||||||
|
}
|
||||||
|
if query['volume'] is None:
|
||||||
|
query['volume'] = 0.01
|
||||||
|
print(query)
|
||||||
|
data = requests.get(API_CALC_URL, params=query).json()
|
||||||
|
print(data)
|
||||||
|
|
||||||
|
if data.get('price', None) is not None:
|
||||||
|
return int(data['price'])
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    import sys

    # Price the spreadsheet given as the first command-line argument; with no
    # argument, fall back to the path that was previously hard-coded here.
    source_path = sys.argv[1] if len(sys.argv) > 1 else './downloads/1342AWP1A.xls'
    parser = ExcelParser(source_path)
    print(parser.calculate())
|
53
main.py
53
main.py
|
@ -1,15 +1,18 @@
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import os
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
from typing import Self
|
from typing import Self
|
||||||
|
|
||||||
from selenium.webdriver import Keys
|
from selenium.webdriver import Keys, ActionChains
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.support.wait import WebDriverWait
|
from selenium.webdriver.support.wait import WebDriverWait
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
||||||
|
from excel_parser import ExcelParser
|
||||||
|
from storage import Storage
|
||||||
|
|
||||||
|
|
||||||
class Parser:
|
class Parser:
|
||||||
|
@ -23,11 +26,14 @@ class Parser:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())}
|
prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())}
|
||||||
self._options.add_experimental_option("prefs", prefs)
|
self._options.add_experimental_option("prefs", prefs)
|
||||||
self._options.add_argument("--disable-extensions")
|
# self._options.add_argument("--disable-extensions")
|
||||||
self._options.add_argument("--disable-gpu")
|
# self._options.add_argument("--disable-gpu")
|
||||||
self._options.add_argument("--headless=new")
|
# self._options.add_argument("--headless=new")
|
||||||
self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install())
|
self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install())
|
||||||
|
|
||||||
|
self.storage = Storage()
|
||||||
|
self.storage.create_tables()
|
||||||
|
|
||||||
def __enter__(self) -> Self:
|
def __enter__(self) -> Self:
|
||||||
self._driver = webdriver.Chrome(service=self._service, options=self._options)
|
self._driver = webdriver.Chrome(service=self._service, options=self._options)
|
||||||
return self
|
return self
|
||||||
|
@ -84,28 +90,51 @@ class Parser:
|
||||||
|
|
||||||
# Скачать документацию
|
# Скачать документацию
|
||||||
try:
|
try:
|
||||||
download_documentation_button = self.find_elem(By.CSS_SELECTOR, '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
|
download_documentation_button = self.find_elem(By.CSS_SELECTOR,
|
||||||
time.sleep(1)
|
'#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
|
||||||
|
time.sleep(5)
|
||||||
|
ActionChains(self._driver).scroll_to_element(download_documentation_button).scroll_by_amount(0, 100).perform()
|
||||||
download_documentation_button.click()
|
download_documentation_button.click()
|
||||||
self.download_documentation()
|
fp = self.download_documentation()
|
||||||
|
|
||||||
|
for file in fp:
|
||||||
|
e_parser = ExcelParser(file)
|
||||||
|
price = e_parser.calculate()
|
||||||
|
if not price:
|
||||||
|
break
|
||||||
|
self.storage.add_link(str(file.absolute()), url, price)
|
||||||
|
|
||||||
except NoSuchElementException:
|
except NoSuchElementException:
|
||||||
self.download_documentation()
|
fp = self.download_documentation()
|
||||||
|
|
||||||
|
for file in fp:
|
||||||
|
e_parser = ExcelParser(file)
|
||||||
|
price = e_parser.calculate()
|
||||||
|
if not price:
|
||||||
|
break
|
||||||
|
self.storage.add_link(str(file.absolute()), url, price)
|
||||||
|
|
||||||
|
def download_documentation(self) -> list[pathlib.Path]:
    """Download every spreadsheet linked in the documentation block.

    The contents of ``./downloads`` are compared before and after the
    downloads are triggered, and the spreadsheets that appeared in between
    are returned.

    Returns:
        Sorted paths of the newly downloaded ``.xls`` / ``.xlsx`` files.
    """
    downloads_dir = pathlib.Path('./downloads')

    def snapshot() -> set[pathlib.Path]:
        # rglob keeps the real location of nested files; joining bare file
        # names onto the root directory would produce paths that do not exist.
        return {entry for entry in downloads_dir.rglob('*') if entry.is_file()}

    before = snapshot()

    time.sleep(5)
    documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation')
    docs = documentation_block.find_elements(By.CSS_SELECTOR, 'a')
    for doc in docs:
        href = doc.get_attribute('href')
        # Anchors without an href yield None; skip them instead of crashing.
        if not href or not href.endswith(('.xlsx', '.xls')):
            continue
        self._driver.get(href)
        # Give the browser time to write the file to disk.
        time.sleep(3)

    new_files = snapshot() - before

    # Ignore anything that is not a spreadsheet (for example a download that
    # is still in progress), since callers hand every result to ExcelParser.
    # Sorting makes the processing order deterministic.
    return sorted(
        path for path in new_files
        if path.suffix.lower() in ('.xls', '.xlsx')
    )
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
with Parser() as parser:
|
with Parser() as parser:
|
||||||
parser.login()
|
parser.login()
|
||||||
parser.search()
|
parser.search()
|
||||||
|
|
||||||
|
|
|
@ -1,2 +1,12 @@
|
||||||
selenium
|
selenium
|
||||||
webdriver-manager
|
webdriver-manager
|
||||||
|
openpyxl
|
||||||
|
pandas
|
||||||
|
requests
|
||||||
|
xlrd
|
||||||
|
xlwt
|
||||||
|
xlutils
|
||||||
|
xls2xlsx
|
||||||
|
pyexcel
|
||||||
|
pyexcel-xls
|
||||||
|
pyexcel-xlsx
|
|
@ -8,38 +8,95 @@ attrs==23.2.0
|
||||||
# via
|
# via
|
||||||
# outcome
|
# outcome
|
||||||
# trio
|
# trio
|
||||||
|
beautifulsoup4==4.12.3
|
||||||
|
# via xls2xlsx
|
||||||
certifi==2023.11.17
|
certifi==2023.11.17
|
||||||
# via
|
# via
|
||||||
# requests
|
# requests
|
||||||
# selenium
|
# selenium
|
||||||
|
chardet==5.2.0
|
||||||
|
# via
|
||||||
|
# pyexcel
|
||||||
|
# xls2xlsx
|
||||||
charset-normalizer==3.3.2
|
charset-normalizer==3.3.2
|
||||||
# via requests
|
# via requests
|
||||||
|
cssutils==2.9.0
|
||||||
|
# via xls2xlsx
|
||||||
|
currency-symbols==2.0.3
|
||||||
|
# via xls2xlsx
|
||||||
|
et-xmlfile==1.1.0
|
||||||
|
# via openpyxl
|
||||||
exceptiongroup==1.2.0
|
exceptiongroup==1.2.0
|
||||||
# via
|
# via
|
||||||
# trio
|
# trio
|
||||||
# trio-websocket
|
# trio-websocket
|
||||||
|
fonttools==4.47.2
|
||||||
|
# via xls2xlsx
|
||||||
h11==0.14.0
|
h11==0.14.0
|
||||||
# via wsproto
|
# via wsproto
|
||||||
idna==3.6
|
idna==3.6
|
||||||
# via
|
# via
|
||||||
# requests
|
# requests
|
||||||
# trio
|
# trio
|
||||||
|
lml==0.1.0
|
||||||
|
# via
|
||||||
|
# pyexcel
|
||||||
|
# pyexcel-io
|
||||||
|
numpy==1.26.3
|
||||||
|
# via pandas
|
||||||
|
openpyxl==3.1.2
|
||||||
|
# via
|
||||||
|
# -r requirements.in
|
||||||
|
# pyexcel-xlsx
|
||||||
|
# xls2xlsx
|
||||||
outcome==1.3.0.post0
|
outcome==1.3.0.post0
|
||||||
# via trio
|
# via trio
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
# via webdriver-manager
|
# via webdriver-manager
|
||||||
|
pandas==2.2.0
|
||||||
|
# via -r requirements.in
|
||||||
|
pillow==10.2.0
|
||||||
|
# via xls2xlsx
|
||||||
|
pyexcel==0.7.0
|
||||||
|
# via -r requirements.in
|
||||||
|
pyexcel-io==0.6.6
|
||||||
|
# via
|
||||||
|
# pyexcel
|
||||||
|
# pyexcel-xls
|
||||||
|
# pyexcel-xlsx
|
||||||
|
pyexcel-xls==0.7.0
|
||||||
|
# via -r requirements.in
|
||||||
|
pyexcel-xlsx==0.6.0
|
||||||
|
# via -r requirements.in
|
||||||
pysocks==1.7.1
|
pysocks==1.7.1
|
||||||
# via urllib3
|
# via urllib3
|
||||||
|
python-dateutil==2.8.2
|
||||||
|
# via
|
||||||
|
# pandas
|
||||||
|
# xls2xlsx
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
# via webdriver-manager
|
# via webdriver-manager
|
||||||
|
pytz==2023.4
|
||||||
|
# via pandas
|
||||||
|
pyyaml==6.0.1
|
||||||
|
# via xls2xlsx
|
||||||
requests==2.31.0
|
requests==2.31.0
|
||||||
# via webdriver-manager
|
# via
|
||||||
|
# -r requirements.in
|
||||||
|
# webdriver-manager
|
||||||
|
# xls2xlsx
|
||||||
selenium==4.17.2
|
selenium==4.17.2
|
||||||
# via -r requirements.in
|
# via -r requirements.in
|
||||||
|
six==1.16.0
|
||||||
|
# via python-dateutil
|
||||||
sniffio==1.3.0
|
sniffio==1.3.0
|
||||||
# via trio
|
# via trio
|
||||||
sortedcontainers==2.4.0
|
sortedcontainers==2.4.0
|
||||||
# via trio
|
# via trio
|
||||||
|
soupsieve==2.5
|
||||||
|
# via beautifulsoup4
|
||||||
|
texttable==1.7.0
|
||||||
|
# via pyexcel
|
||||||
trio==0.24.0
|
trio==0.24.0
|
||||||
# via
|
# via
|
||||||
# selenium
|
# selenium
|
||||||
|
@ -48,12 +105,31 @@ trio-websocket==0.11.1
|
||||||
# via selenium
|
# via selenium
|
||||||
typing-extensions==4.9.0
|
typing-extensions==4.9.0
|
||||||
# via selenium
|
# via selenium
|
||||||
|
tzdata==2023.4
|
||||||
|
# via pandas
|
||||||
urllib3[socks]==2.1.0
|
urllib3[socks]==2.1.0
|
||||||
# via
|
# via
|
||||||
# requests
|
# requests
|
||||||
# selenium
|
# selenium
|
||||||
# urllib3
|
# urllib3
|
||||||
|
webcolors==1.13
|
||||||
|
# via xls2xlsx
|
||||||
webdriver-manager==4.0.1
|
webdriver-manager==4.0.1
|
||||||
# via -r requirements.in
|
# via -r requirements.in
|
||||||
wsproto==1.2.0
|
wsproto==1.2.0
|
||||||
# via trio-websocket
|
# via trio-websocket
|
||||||
|
xlrd==2.0.1
|
||||||
|
# via
|
||||||
|
# -r requirements.in
|
||||||
|
# pyexcel-xls
|
||||||
|
# xls2xlsx
|
||||||
|
# xlutils
|
||||||
|
xls2xlsx==0.2.0
|
||||||
|
# via -r requirements.in
|
||||||
|
xlutils==2.0.0
|
||||||
|
# via -r requirements.in
|
||||||
|
xlwt==1.3.0
|
||||||
|
# via
|
||||||
|
# -r requirements.in
|
||||||
|
# pyexcel-xls
|
||||||
|
# xlutils
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
import sqlite3
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from typing import ContextManager
|
||||||
|
|
||||||
|
|
||||||
|
class Storage:
    """SQLite-backed record of processed documents, their links and prices."""

    def __init__(self, db_path: str = "database.db"):
        """Open (or create) the SQLite database at *db_path*.

        The default keeps the previously hard-coded file name; pass
        ``":memory:"`` for a throw-away database.
        """
        self.con = sqlite3.connect(db_path)

    @contextmanager
    def get_cursor(self) -> ContextManager[sqlite3.Cursor]:
        """Yield a cursor on the connection and close it afterwards."""
        cur = self.con.cursor()
        try:
            yield cur
        finally:
            cur.close()

    def close(self) -> None:
        """Close the underlying database connection."""
        self.con.close()

    def create_tables(self):
        """Create the ``ltl`` table if it does not exist yet."""
        # ``with self.con`` commits on success and rolls back on error.
        with self.con, self.get_cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS ltl (
                    id INTEGER PRIMARY KEY,
                    doc_filepath TEXT,
                    link TEXT,
                    total_cost INTEGER
                );""")

    def add_link(self, doc_filepath: str, link: str, total_cost: int):
        """Insert one row into ``ltl`` and commit it."""
        # The connection context manager rolls the insert back if it fails,
        # so a failed statement cannot leave a transaction open.
        with self.con, self.get_cursor() as cur:
            cur.execute(
                "INSERT INTO ltl (doc_filepath, link, total_cost) VALUES (?, ?, ?)",
                (doc_filepath, link, total_cost),
            )

    def is_link_exists(self, link: str) -> bool:
        """Return True if a row with this *link* is stored."""
        with self.get_cursor() as cur:
            res = cur.execute("SELECT 1 FROM ltl WHERE link = ? LIMIT 1", (link,))
            # Convert the fetched row (or None) to the bool the signature promises.
            return res.fetchone() is not None

    def is_doc_exists(self, doc_filepath: str) -> bool:
        """Return True if a row with this *doc_filepath* is stored."""
        with self.get_cursor() as cur:
            res = cur.execute(
                "SELECT 1 FROM ltl WHERE doc_filepath = ? LIMIT 1", (doc_filepath,)
            )
            return res.fetchone() is not None
|
Loading…
Reference in New Issue