add parser

master
Ernest Litvinenko 2024-02-06 10:47:22 +03:00
parent 68b1f8d039
commit f69c4377bc
5 changed files with 258 additions and 14 deletions

87
excel_parser.py Normal file
View File

@ -0,0 +1,87 @@
import pathlib
from sqlite3 import Row
import requests
from openpyxl import load_workbook, Workbook
from openpyxl.cell import Cell
from openpyxl.worksheet.worksheet import Worksheet
from pathlib import Path
import pandas as pd
import os

# Price calculator endpoint queried by ExcelParser.calculate().
API_CALC_URL = "https://api.jde.ru/vD/calculator/PriceAddress"
# Fixed flags sent verbatim as the "type", "pickup" and "delivery" query parameters.
TYPE = "1"
PICKUP = "1"
DELIVERY = "1"
# API credentials sent as the "user" and "token" query parameters.
# They can be overridden through the environment so that real secrets do not
# have to live in version control; the literals are kept only as a
# backward-compatible fallback.
# NOTE(review): consider rotating these values and dropping the fallbacks.
USER = os.environ.get("JDE_API_USER", "2252130929823409")
TOKEN = os.environ.get("JDE_API_TOKEN", "67065749269910593")
class ExcelParser:
def __init__(self, path: Path | str):
self._wb: Workbook | None = None
if isinstance(path, str):
path = Path(path)
assert path.is_file() is True
self.convert_to_xlsx(path)
self._sheet: Worksheet = self._wb.active
def convert_to_xlsx(self, path: pathlib.Path):
import pyexcel as pe
if path.suffix == '.xlsx':
self._wb = load_workbook(str(path))
return
pe.save_book_as(file_name=str(path), dest_file_name=str(path.with_suffix('.xlsx')))
self._wb = load_workbook(str(path.with_suffix('.xlsx')))
def clean_up_wb(self) -> pd.DataFrame:
triggered = False
for row in self._sheet.rows:
for cell in row:
cell: Cell
row: Row
if isinstance(cell.value, str) and cell.value.startswith('№ заявки'):
self._sheet.delete_rows(1, cell.row - 1)
triggered = True
break
if triggered:
break
df = pd.DataFrame(self._sheet.values)
not_nullable_cols = df.iloc[0].dropna().index
df = df.iloc[:, not_nullable_cols]
df.columns = df.iloc[0, :]
df: pd.DataFrame = df.drop(0, axis=0)
df.to_excel('./text.xlsx')
return df[['Адрес отгрузки', 'Адрес склада', 'Масса', 'Объем']].to_dict(orient='records')[0]
def calculate(self) -> int | None:
df = self.clean_up_wb()
query = {
"type": TYPE,
"token": TOKEN,
"delivery": DELIVERY,
"pickup": PICKUP,
"user": USER,
"addr_from": df['Адрес отгрузки'],
"addr_to": df['Адрес склада'],
"weight": df['Масса'],
"volume": df['Объем'],
}
if query['volume'] is None:
query['volume'] = 0.01
print(query)
data = requests.get(API_CALC_URL, params=query).json()
print(data)
if data.get('price', None) is not None:
return int(data['price'])
return None
if __name__ == '__main__':
    # Manual smoke run: price one previously downloaded sample document.
    print(ExcelParser('./downloads/1342AWP1A.xls').calculate())

53
main.py
View File

@ -1,15 +1,18 @@
import pathlib
import os
import time
import re
from typing import Self
from selenium.webdriver import Keys
from selenium.webdriver import Keys, ActionChains
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from excel_parser import ExcelParser
from storage import Storage
class Parser:
@ -23,11 +26,14 @@ class Parser:
def __init__(self):
prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())}
self._options.add_experimental_option("prefs", prefs)
self._options.add_argument("--disable-extensions")
self._options.add_argument("--disable-gpu")
self._options.add_argument("--headless=new")
# self._options.add_argument("--disable-extensions")
# self._options.add_argument("--disable-gpu")
# self._options.add_argument("--headless=new")
self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install())
self.storage = Storage()
self.storage.create_tables()
def __enter__(self) -> Self:
self._driver = webdriver.Chrome(service=self._service, options=self._options)
return self
@ -84,28 +90,51 @@ class Parser:
# Скачать документацию
try:
download_documentation_button = self.find_elem(By.CSS_SELECTOR, '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
time.sleep(1)
download_documentation_button = self.find_elem(By.CSS_SELECTOR,
'#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
time.sleep(5)
ActionChains(self._driver).scroll_to_element(download_documentation_button).scroll_by_amount(0, 100).perform()
download_documentation_button.click()
self.download_documentation()
fp = self.download_documentation()
for file in fp:
e_parser = ExcelParser(file)
price = e_parser.calculate()
if not price:
break
self.storage.add_link(str(file.absolute()), url, price)
except NoSuchElementException:
self.download_documentation()
fp = self.download_documentation()
for file in fp:
e_parser = ExcelParser(file)
price = e_parser.calculate()
if not price:
break
self.storage.add_link(str(file.absolute()), url, price)
def download_documentation(self) -> list[pathlib.Path]:
all_files_1 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2])
def download_documentation(self):
time.sleep(5)
documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation')
docs = documentation_block.find_elements(By.CSS_SELECTOR, 'a')
for doc in docs:
href = doc.get_attribute('href')
if not href.endswith('.xlsx'):
if not href.endswith('.xlsx') and not href.endswith('.xls'):
continue
self._driver.get(href)
time.sleep(3)
all_files_2 = set(pathlib.Path('./downloads') / pathlib.Path(file) for tree in os.walk('./downloads') for file in tree[2])
fp = all_files_2 - all_files_1
return [file for file in fp]
if __name__ == "__main__":
with Parser() as parser:
parser.login()
parser.search()

View File

@ -1,2 +1,12 @@
selenium
webdriver-manager
openpyxl
pandas
requests
xlrd
xlwt
xlutils
xls2xlsx
pyexcel
pyexcel-xls
pyexcel-xlsx

View File

@ -8,38 +8,95 @@ attrs==23.2.0
# via
# outcome
# trio
beautifulsoup4==4.12.3
# via xls2xlsx
certifi==2023.11.17
# via
# requests
# selenium
chardet==5.2.0
# via
# pyexcel
# xls2xlsx
charset-normalizer==3.3.2
# via requests
cssutils==2.9.0
# via xls2xlsx
currency-symbols==2.0.3
# via xls2xlsx
et-xmlfile==1.1.0
# via openpyxl
exceptiongroup==1.2.0
# via
# trio
# trio-websocket
fonttools==4.47.2
# via xls2xlsx
h11==0.14.0
# via wsproto
idna==3.6
# via
# requests
# trio
lml==0.1.0
# via
# pyexcel
# pyexcel-io
numpy==1.26.3
# via pandas
openpyxl==3.1.2
# via
# -r requirements.in
# pyexcel-xlsx
# xls2xlsx
outcome==1.3.0.post0
# via trio
packaging==23.2
# via webdriver-manager
pandas==2.2.0
# via -r requirements.in
pillow==10.2.0
# via xls2xlsx
pyexcel==0.7.0
# via -r requirements.in
pyexcel-io==0.6.6
# via
# pyexcel
# pyexcel-xls
# pyexcel-xlsx
pyexcel-xls==0.7.0
# via -r requirements.in
pyexcel-xlsx==0.6.0
# via -r requirements.in
pysocks==1.7.1
# via urllib3
python-dateutil==2.8.2
# via
# pandas
# xls2xlsx
python-dotenv==1.0.1
# via webdriver-manager
pytz==2023.4
# via pandas
pyyaml==6.0.1
# via xls2xlsx
requests==2.31.0
# via webdriver-manager
# via
# -r requirements.in
# webdriver-manager
# xls2xlsx
selenium==4.17.2
# via -r requirements.in
six==1.16.0
# via python-dateutil
sniffio==1.3.0
# via trio
sortedcontainers==2.4.0
# via trio
soupsieve==2.5
# via beautifulsoup4
texttable==1.7.0
# via pyexcel
trio==0.24.0
# via
# selenium
@ -48,12 +105,31 @@ trio-websocket==0.11.1
# via selenium
typing-extensions==4.9.0
# via selenium
tzdata==2023.4
# via pandas
urllib3[socks]==2.1.0
# via
# requests
# selenium
# urllib3
webcolors==1.13
# via xls2xlsx
webdriver-manager==4.0.1
# via -r requirements.in
wsproto==1.2.0
# via trio-websocket
xlrd==2.0.1
# via
# -r requirements.in
# pyexcel-xls
# xls2xlsx
# xlutils
xls2xlsx==0.2.0
# via -r requirements.in
xlutils==2.0.0
# via -r requirements.in
xlwt==1.3.0
# via
# -r requirements.in
# pyexcel-xls
# xlutils

42
storage.py Normal file
View File

@ -0,0 +1,42 @@
import sqlite3
from contextlib import contextmanager
from typing import ContextManager
class Storage:
    """SQLite-backed store of priced documents, kept in the ``ltl`` table."""

    def __init__(self, db_path: str = "database.db"):
        """Open (or create) the SQLite database.

        Args:
            db_path: database file to connect to; defaults to ``database.db``
                in the current working directory.
        """
        self.con = sqlite3.connect(db_path)

    @contextmanager
    def get_cursor(self) -> ContextManager[sqlite3.Cursor]:
        """Yield a cursor on the connection and close it on exit."""
        cur = self.con.cursor()
        try:
            yield cur
        finally:
            cur.close()

    def create_tables(self):
        """Create the ``ltl`` table if it does not exist yet."""
        with self.get_cursor() as cur:
            cur.execute("""
                CREATE TABLE IF NOT EXISTS ltl (
                    id INTEGER PRIMARY KEY,
                    doc_filepath TEXT,
                    link TEXT,
                    total_cost INTEGER
                );""")

    def add_link(self, doc_filepath: str, link: str, total_cost: int):
        """Insert one row into ``ltl`` and commit it."""
        with self.get_cursor() as cur:
            cur.execute(
                "INSERT INTO ltl (doc_filepath, link, total_cost) VALUES (?, ?, ?)",
                (doc_filepath, link, total_cost),
            )
            self.con.commit()

    def is_link_exists(self, link: str) -> bool:
        """Return True if at least one row has this ``link``."""
        with self.get_cursor() as cur:
            res = cur.execute("SELECT 1 FROM ltl WHERE link = ? LIMIT 1", (link,))
            # Convert to a real bool so the result matches the annotation.
            return res.fetchone() is not None

    def is_doc_exists(self, doc_filepath: str) -> bool:
        """Return True if at least one row has this ``doc_filepath``."""
        with self.get_cursor() as cur:
            res = cur.execute(
                "SELECT 1 FROM ltl WHERE doc_filepath = ? LIMIT 1", (doc_filepath,)
            )
            return res.fetchone() is not None

    def close(self) -> None:
        """Close the underlying database connection."""
        self.con.close()