add parser
parent
68b1f8d039
commit
f69c4377bc
|
@ -0,0 +1,87 @@
|
|||
import pathlib
|
||||
from sqlite3 import Row
|
||||
|
||||
import requests
|
||||
from openpyxl import load_workbook, Workbook
|
||||
from openpyxl.cell import Cell
|
||||
from openpyxl.worksheet.worksheet import Worksheet
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
import os

# Endpoint of the price calculator queried by ExcelParser.calculate().
API_CALC_URL = "https://api.jde.ru/vD/calculator/PriceAddress"

# Fixed values sent as the "type", "pickup" and "delivery" query parameters.
# NOTE(review): their meaning is defined by the remote API, not by this file — confirm.
TYPE = "1"
PICKUP = "1"
DELIVERY = "1"

# API credentials, sent as the "user" and "token" query parameters.
# They can be overridden through the JDE_USER / JDE_TOKEN environment
# variables; the literals are kept only as backward-compatible defaults.
# NOTE(review): secrets committed to the repository should be rotated and
# the defaults removed once every deployment sets the variables.
USER = os.environ.get("JDE_USER", "2252130929823409")
TOKEN = os.environ.get("JDE_TOKEN", "67065749269910593")
|
||||
|
||||
|
||||
class ExcelParser:
|
||||
def __init__(self, path: Path | str):
|
||||
self._wb: Workbook | None = None
|
||||
if isinstance(path, str):
|
||||
path = Path(path)
|
||||
|
||||
assert path.is_file() is True
|
||||
self.convert_to_xlsx(path)
|
||||
self._sheet: Worksheet = self._wb.active
|
||||
|
||||
def convert_to_xlsx(self, path: pathlib.Path):
|
||||
import pyexcel as pe
|
||||
if path.suffix == '.xlsx':
|
||||
self._wb = load_workbook(str(path))
|
||||
return
|
||||
pe.save_book_as(file_name=str(path), dest_file_name=str(path.with_suffix('.xlsx')))
|
||||
self._wb = load_workbook(str(path.with_suffix('.xlsx')))
|
||||
|
||||
def clean_up_wb(self) -> pd.DataFrame:
|
||||
triggered = False
|
||||
for row in self._sheet.rows:
|
||||
for cell in row:
|
||||
cell: Cell
|
||||
row: Row
|
||||
if isinstance(cell.value, str) and cell.value.startswith('№ заявки'):
|
||||
self._sheet.delete_rows(1, cell.row - 1)
|
||||
triggered = True
|
||||
break
|
||||
|
||||
if triggered:
|
||||
break
|
||||
|
||||
df = pd.DataFrame(self._sheet.values)
|
||||
not_nullable_cols = df.iloc[0].dropna().index
|
||||
df = df.iloc[:, not_nullable_cols]
|
||||
df.columns = df.iloc[0, :]
|
||||
df: pd.DataFrame = df.drop(0, axis=0)
|
||||
|
||||
df.to_excel('./text.xlsx')
|
||||
|
||||
return df[['Адрес отгрузки', 'Адрес склада', 'Масса', 'Объем']].to_dict(orient='records')[0]
|
||||
|
||||
def calculate(self) -> int | None:
|
||||
df = self.clean_up_wb()
|
||||
query = {
|
||||
"type": TYPE,
|
||||
"token": TOKEN,
|
||||
"delivery": DELIVERY,
|
||||
"pickup": PICKUP,
|
||||
"user": USER,
|
||||
"addr_from": df['Адрес отгрузки'],
|
||||
"addr_to": df['Адрес склада'],
|
||||
"weight": df['Масса'],
|
||||
"volume": df['Объем'],
|
||||
}
|
||||
if query['volume'] is None:
|
||||
query['volume'] = 0.01
|
||||
print(query)
|
||||
data = requests.get(API_CALC_URL, params=query).json()
|
||||
print(data)
|
||||
|
||||
if data.get('price', None) is not None:
|
||||
return int(data['price'])
|
||||
return None
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual run: parse one downloaded sample document and print its price.
    sample_parser = ExcelParser('./downloads/1342AWP1A.xls')
    quoted_price = sample_parser.calculate()
    print(quoted_price)
|
53
main.py
53
main.py
|
@ -1,15 +1,18 @@
|
|||
import pathlib
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
from typing import Self
|
||||
|
||||
from selenium.webdriver import Keys
|
||||
from selenium.webdriver import Keys, ActionChains
|
||||
from selenium.webdriver.common.by import By
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.support.wait import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
||||
from excel_parser import ExcelParser
|
||||
from storage import Storage
|
||||
|
||||
|
||||
class Parser:
|
||||
|
@ -23,11 +26,14 @@ class Parser:
|
|||
def __init__(self):
|
||||
prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())}
|
||||
self._options.add_experimental_option("prefs", prefs)
|
||||
self._options.add_argument("--disable-extensions")
|
||||
self._options.add_argument("--disable-gpu")
|
||||
self._options.add_argument("--headless=new")
|
||||
# self._options.add_argument("--disable-extensions")
|
||||
# self._options.add_argument("--disable-gpu")
|
||||
# self._options.add_argument("--headless=new")
|
||||
self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install())
|
||||
|
||||
self.storage = Storage()
|
||||
self.storage.create_tables()
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
self._driver = webdriver.Chrome(service=self._service, options=self._options)
|
||||
return self
|
||||
|
@ -84,28 +90,51 @@ class Parser:
|
|||
|
||||
# Скачать документацию
|
||||
try:
|
||||
download_documentation_button = self.find_elem(By.CSS_SELECTOR, '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
|
||||
time.sleep(1)
|
||||
download_documentation_button = self.find_elem(By.CSS_SELECTOR,
|
||||
'#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
|
||||
time.sleep(5)
|
||||
ActionChains(self._driver).scroll_to_element(download_documentation_button).scroll_by_amount(0, 100).perform()
|
||||
download_documentation_button.click()
|
||||
self.download_documentation()
|
||||
fp = self.download_documentation()
|
||||
|
||||
for file in fp:
|
||||
e_parser = ExcelParser(file)
|
||||
price = e_parser.calculate()
|
||||
if not price:
|
||||
break
|
||||
self.storage.add_link(str(file.absolute()), url, price)
|
||||
|
||||
except NoSuchElementException:
|
||||
self.download_documentation()
|
||||
fp = self.download_documentation()
|
||||
|
||||
for file in fp:
|
||||
e_parser = ExcelParser(file)
|
||||
price = e_parser.calculate()
|
||||
if not price:
|
||||
break
|
||||
self.storage.add_link(str(file.absolute()), url, price)
|
||||
|
||||
def download_documentation(self) -> list[pathlib.Path]:
    """Download the spreadsheet attachments of the current page.

    Takes a snapshot of ``./downloads`` before and after opening every
    ``.xls`` / ``.xlsx`` link found inside ``#download_documentation`` and
    reports the difference.

    Returns:
        The paths that appeared in ``./downloads`` during the call, sorted
        so the order is deterministic.
    """
    downloads_dir = pathlib.Path('./downloads')

    def snapshot() -> set[pathlib.Path]:
        # Join each name with the directory os.walk found it in, so files
        # inside sub-directories get their real path.
        return {
            pathlib.Path(dirpath) / name
            for dirpath, _dirnames, names in os.walk(downloads_dir)
            for name in names
        }

    before = snapshot()

    # NOTE(review): fixed sleeps presumably give the page and each download
    # time to finish; nothing here verifies that a download has completed.
    time.sleep(5)
    documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation')
    for doc in documentation_block.find_elements(By.CSS_SELECTOR, 'a'):
        href = doc.get_attribute('href')
        # Anchors without an href yield None; skip them instead of crashing.
        if not href or not href.lower().endswith(('.xlsx', '.xls')):
            continue
        self._driver.get(href)
        time.sleep(3)

    return sorted(snapshot() - before)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: open the browser session, log in, then search.
    with Parser() as scraper:
        scraper.login()
        scraper.search()
|
||||
|
||||
|
|
|
@ -1,2 +1,12 @@
|
|||
selenium
|
||||
webdriver-manager
|
||||
webdriver-manager
|
||||
openpyxl
|
||||
pandas
|
||||
requests
|
||||
xlrd
|
||||
xlwt
|
||||
xlutils
|
||||
xls2xlsx
|
||||
pyexcel
|
||||
pyexcel-xls
|
||||
pyexcel-xlsx
|
|
@ -8,38 +8,95 @@ attrs==23.2.0
|
|||
# via
|
||||
# outcome
|
||||
# trio
|
||||
beautifulsoup4==4.12.3
|
||||
# via xls2xlsx
|
||||
certifi==2023.11.17
|
||||
# via
|
||||
# requests
|
||||
# selenium
|
||||
chardet==5.2.0
|
||||
# via
|
||||
# pyexcel
|
||||
# xls2xlsx
|
||||
charset-normalizer==3.3.2
|
||||
# via requests
|
||||
cssutils==2.9.0
|
||||
# via xls2xlsx
|
||||
currency-symbols==2.0.3
|
||||
# via xls2xlsx
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
exceptiongroup==1.2.0
|
||||
# via
|
||||
# trio
|
||||
# trio-websocket
|
||||
fonttools==4.47.2
|
||||
# via xls2xlsx
|
||||
h11==0.14.0
|
||||
# via wsproto
|
||||
idna==3.6
|
||||
# via
|
||||
# requests
|
||||
# trio
|
||||
lml==0.1.0
|
||||
# via
|
||||
# pyexcel
|
||||
# pyexcel-io
|
||||
numpy==1.26.3
|
||||
# via pandas
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements.in
|
||||
# pyexcel-xlsx
|
||||
# xls2xlsx
|
||||
outcome==1.3.0.post0
|
||||
# via trio
|
||||
packaging==23.2
|
||||
# via webdriver-manager
|
||||
pandas==2.2.0
|
||||
# via -r requirements.in
|
||||
pillow==10.2.0
|
||||
# via xls2xlsx
|
||||
pyexcel==0.7.0
|
||||
# via -r requirements.in
|
||||
pyexcel-io==0.6.6
|
||||
# via
|
||||
# pyexcel
|
||||
# pyexcel-xls
|
||||
# pyexcel-xlsx
|
||||
pyexcel-xls==0.7.0
|
||||
# via -r requirements.in
|
||||
pyexcel-xlsx==0.6.0
|
||||
# via -r requirements.in
|
||||
pysocks==1.7.1
|
||||
# via urllib3
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# pandas
|
||||
# xls2xlsx
|
||||
python-dotenv==1.0.1
|
||||
# via webdriver-manager
|
||||
pytz==2023.4
|
||||
# via pandas
|
||||
pyyaml==6.0.1
|
||||
# via xls2xlsx
|
||||
requests==2.31.0
|
||||
# via webdriver-manager
|
||||
# via
|
||||
# -r requirements.in
|
||||
# webdriver-manager
|
||||
# xls2xlsx
|
||||
selenium==4.17.2
|
||||
# via -r requirements.in
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
sniffio==1.3.0
|
||||
# via trio
|
||||
sortedcontainers==2.4.0
|
||||
# via trio
|
||||
soupsieve==2.5
|
||||
# via beautifulsoup4
|
||||
texttable==1.7.0
|
||||
# via pyexcel
|
||||
trio==0.24.0
|
||||
# via
|
||||
# selenium
|
||||
|
@ -48,12 +105,31 @@ trio-websocket==0.11.1
|
|||
# via selenium
|
||||
typing-extensions==4.9.0
|
||||
# via selenium
|
||||
tzdata==2023.4
|
||||
# via pandas
|
||||
urllib3[socks]==2.1.0
|
||||
# via
|
||||
# requests
|
||||
# selenium
|
||||
# urllib3
|
||||
webcolors==1.13
|
||||
# via xls2xlsx
|
||||
webdriver-manager==4.0.1
|
||||
# via -r requirements.in
|
||||
wsproto==1.2.0
|
||||
# via trio-websocket
|
||||
xlrd==2.0.1
|
||||
# via
|
||||
# -r requirements.in
|
||||
# pyexcel-xls
|
||||
# xls2xlsx
|
||||
# xlutils
|
||||
xls2xlsx==0.2.0
|
||||
# via -r requirements.in
|
||||
xlutils==2.0.0
|
||||
# via -r requirements.in
|
||||
xlwt==1.3.0
|
||||
# via
|
||||
# -r requirements.in
|
||||
# pyexcel-xls
|
||||
# xlutils
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
import sqlite3
|
||||
from contextlib import contextmanager
|
||||
from typing import ContextManager
|
||||
|
||||
|
||||
class Storage:
    """SQLite-backed record of processed documents and their calculated cost."""

    def __init__(self, db_path: str = "database.db"):
        """Open (or create) the database.

        Args:
            db_path: database location passed to ``sqlite3.connect``;
                defaults to ``database.db`` in the working directory.
        """
        self.con = sqlite3.connect(db_path)

    @contextmanager
    def get_cursor(self) -> ContextManager[sqlite3.Cursor]:
        """Yield a cursor on the shared connection and close it afterwards."""
        cur = self.con.cursor()
        try:
            yield cur
        finally:
            cur.close()

    def create_tables(self):
        """Create the ``ltl`` table if it does not exist yet."""
        with self.get_cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS ltl (
                    id INTEGER PRIMARY KEY,
                    doc_filepath TEXT,
                    link TEXT,
                    total_cost INTEGER
                );""")

    def add_link(self, doc_filepath: str, link: str, total_cost: int):
        """Insert one document record and commit it."""
        with self.get_cursor() as cur:
            cur.execute("INSERT INTO ltl (doc_filepath, link, total_cost) VALUES (?, ?, ?)",
                        (doc_filepath, link, total_cost))
            self.con.commit()

    def is_link_exists(self, link: str) -> bool:
        """Return True when a record with this *link* is stored."""
        with self.get_cursor() as cur:
            # Only existence matters, so fetch a constant for at most one row.
            cur.execute("SELECT 1 FROM ltl WHERE link = ? LIMIT 1", (link,))
            return cur.fetchone() is not None

    def is_doc_exists(self, doc_filepath: str) -> bool:
        """Return True when a record with this *doc_filepath* is stored."""
        with self.get_cursor() as cur:
            cur.execute("SELECT 1 FROM ltl WHERE doc_filepath = ? LIMIT 1", (doc_filepath,))
            return cur.fetchone() is not None

    def close(self):
        """Close the underlying database connection."""
        self.con.close()
|
Loading…
Reference in New Issue