auth + downloading xlsx files
parent
26aaaf672a
commit
45b540b753
|
@ -0,0 +1,108 @@
|
||||||
|
import pathlib
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
from selenium.webdriver import Keys
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.support.wait import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
||||||
|
|
||||||
|
|
||||||
|
class Parser:
|
||||||
|
keyword = "Велесстрой"
|
||||||
|
url = "https://www.b2b-center.ru/market"
|
||||||
|
|
||||||
|
_driver: webdriver.Chrome
|
||||||
|
_options: webdriver.ChromeOptions = webdriver.ChromeOptions()
|
||||||
|
_service: webdriver.ChromeService
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
prefs = {"download.default_directory": str(pathlib.Path('./downloads').absolute())};
|
||||||
|
self._options.add_experimental_option("prefs", prefs)
|
||||||
|
self._service = webdriver.ChromeService(executable_path=ChromeDriverManager().install())
|
||||||
|
|
||||||
|
def __enter__(self) -> Self:
|
||||||
|
self._driver = webdriver.Chrome(service=self._service, options=self._options)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||||
|
print("Gracefully shutting down...")
|
||||||
|
self._driver.close()
|
||||||
|
|
||||||
|
def find_elem(self, by: str, value: str):
|
||||||
|
try:
|
||||||
|
return WebDriverWait(self._driver, 10).until(EC.presence_of_element_located((by, value)))
|
||||||
|
except TimeoutException:
|
||||||
|
raise NoSuchElementException("Element not found")
|
||||||
|
|
||||||
|
def login(self):
|
||||||
|
self._driver.get(self.url)
|
||||||
|
|
||||||
|
time.sleep(3)
|
||||||
|
# Open login modal
|
||||||
|
element = self.find_elem(By.ID, "auth_ajax_modal_trigger")
|
||||||
|
element.click()
|
||||||
|
|
||||||
|
login_control = self.find_elem(By.ID, "login_control")
|
||||||
|
password_control = self.find_elem(By.ID, "password_control")
|
||||||
|
|
||||||
|
login_control.click()
|
||||||
|
login_control.clear()
|
||||||
|
login_control.send_keys("jde2015")
|
||||||
|
|
||||||
|
password_control.click()
|
||||||
|
password_control.clear()
|
||||||
|
password_control.send_keys("Bel8#Ans3")
|
||||||
|
time.sleep(5)
|
||||||
|
password_control.send_keys(Keys.RETURN)
|
||||||
|
|
||||||
|
def search(self):
|
||||||
|
time.sleep(5)
|
||||||
|
self._driver.get(self.url + f'/?f_keyword={self.keyword}')
|
||||||
|
time.sleep(10)
|
||||||
|
table = self.find_elem(By.CSS_SELECTOR, ".search-results > tbody")
|
||||||
|
links = table.find_elements(By.CSS_SELECTOR, "tr > td:nth-child(1) > a")
|
||||||
|
for link in links:
|
||||||
|
href = link.get_attribute("href")
|
||||||
|
description = link.find_element(By.CSS_SELECTOR, 'div').text
|
||||||
|
|
||||||
|
# Check LTL
|
||||||
|
if len(re.findall(r"[Ll][Tt][Ll]", description)) != 0:
|
||||||
|
self.accept_documentation(href)
|
||||||
|
break
|
||||||
|
|
||||||
|
def accept_documentation(self, url: str):
|
||||||
|
time.sleep(3)
|
||||||
|
self._driver.get(url)
|
||||||
|
|
||||||
|
# Скачать документацию
|
||||||
|
try:
|
||||||
|
download_documentation_button = self.find_elem(By.CSS_SELECTOR, '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]')
|
||||||
|
time.sleep(1)
|
||||||
|
download_documentation_button.click()
|
||||||
|
self.download_documentation()
|
||||||
|
|
||||||
|
except NoSuchElementException:
|
||||||
|
self.download_documentation()
|
||||||
|
|
||||||
|
def download_documentation(self):
|
||||||
|
time.sleep(5)
|
||||||
|
documentation_block = self.find_elem(By.CSS_SELECTOR, '#download_documentation')
|
||||||
|
docs = documentation_block.find_elements(By.CSS_SELECTOR, 'a')
|
||||||
|
for doc in docs:
|
||||||
|
href = doc.get_attribute('href')
|
||||||
|
if not href.endswith('.xlsx'):
|
||||||
|
continue
|
||||||
|
self._driver.get(href)
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Run the scraper once: log in, then search and download."""
    # The context manager starts the browser on entry and shuts it down
    # on exit, including when login or search raises.
    with Parser() as scraper:
        scraper.login()
        scraper.search()


if __name__ == "__main__":
    main()
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
selenium
|
||||||
|
webdriver-manager
|
|
@ -0,0 +1,59 @@
|
||||||
|
#
|
||||||
|
# This file is autogenerated by pip-compile with Python 3.10
|
||||||
|
# by the following command:
|
||||||
|
#
|
||||||
|
# pip-compile
|
||||||
|
#
|
||||||
|
attrs==23.2.0
|
||||||
|
# via
|
||||||
|
# outcome
|
||||||
|
# trio
|
||||||
|
certifi==2023.11.17
|
||||||
|
# via
|
||||||
|
# requests
|
||||||
|
# selenium
|
||||||
|
charset-normalizer==3.3.2
|
||||||
|
# via requests
|
||||||
|
exceptiongroup==1.2.0
|
||||||
|
# via
|
||||||
|
# trio
|
||||||
|
# trio-websocket
|
||||||
|
h11==0.14.0
|
||||||
|
# via wsproto
|
||||||
|
idna==3.6
|
||||||
|
# via
|
||||||
|
# requests
|
||||||
|
# trio
|
||||||
|
outcome==1.3.0.post0
|
||||||
|
# via trio
|
||||||
|
packaging==23.2
|
||||||
|
# via webdriver-manager
|
||||||
|
pysocks==1.7.1
|
||||||
|
# via urllib3
|
||||||
|
python-dotenv==1.0.1
|
||||||
|
# via webdriver-manager
|
||||||
|
requests==2.31.0
|
||||||
|
# via webdriver-manager
|
||||||
|
selenium==4.17.2
|
||||||
|
# via -r requirements.in
|
||||||
|
sniffio==1.3.0
|
||||||
|
# via trio
|
||||||
|
sortedcontainers==2.4.0
|
||||||
|
# via trio
|
||||||
|
trio==0.24.0
|
||||||
|
# via
|
||||||
|
# selenium
|
||||||
|
# trio-websocket
|
||||||
|
trio-websocket==0.11.1
|
||||||
|
# via selenium
|
||||||
|
typing-extensions==4.9.0
|
||||||
|
# via selenium
|
||||||
|
urllib3[socks]==2.1.0
|
||||||
|
# via
|
||||||
|
# requests
|
||||||
|
# selenium
|
||||||
|
# urllib3
|
||||||
|
webdriver-manager==4.0.1
|
||||||
|
# via -r requirements.in
|
||||||
|
wsproto==1.2.0
|
||||||
|
# via trio-websocket
|
Loading…
Reference in New Issue