b2bcenter-parser/main.py

112 lines
3.8 KiB
Python
Raw Normal View History

2024-01-26 16:21:44 +03:00
import pathlib
import time
import re
from typing import Self
from selenium.webdriver import Keys
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
class Parser:
    """Selenium-driven scraper for the b2b-center.ru marketplace.

    Intended to be used as a context manager: entering it starts a headless
    Chrome session and leaving it shuts the browser down.

    Typical flow: ``login()`` authenticates through the site's modal form,
    then ``search()`` opens the first search result for ``keyword`` whose
    description mentions "LTL" and visits every ``.xlsx`` link in its
    documentation block.
    """

    # Search term passed to the marketplace as the ``f_keyword`` query value.
    keyword = "Велесстрой"
    # Base URL of the marketplace; also the page the login modal is opened on.
    url = "https://www.b2b-center.ru/market"
    _driver: webdriver.Chrome
    _options: webdriver.ChromeOptions
    _service: webdriver.ChromeService

    def __init__(self):
        """Prepare Chrome options and the chromedriver service.

        No browser is started here; that happens in ``__enter__``.
        """
        # Build the options per instance.  A single class-level ChromeOptions
        # object would be shared by every Parser, so each construction would
        # append the same arguments to it again.
        self._options = webdriver.ChromeOptions()
        download_dir = pathlib.Path("./downloads").absolute()
        prefs = {"download.default_directory": str(download_dir)}
        self._options.add_experimental_option("prefs", prefs)
        self._options.add_argument("--disable-extensions")
        self._options.add_argument("--disable-gpu")
        self._options.add_argument("--headless=new")
        # webdriver_manager resolves the path of the chromedriver executable.
        self._service = webdriver.ChromeService(
            executable_path=ChromeDriverManager().install()
        )

    def __enter__(self) -> Self:
        """Start the Chrome session and return this parser."""
        self._driver = webdriver.Chrome(
            service=self._service, options=self._options
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Shut the browser down; exceptions from the ``with`` body propagate."""
        print("Gracefully shutting down...")
        # quit() ends the whole WebDriver session (all windows plus the
        # chromedriver process).  close() would only close the current window
        # and leave the driver process behind.
        self._driver.quit()

    def find_elem(self, by: str, value: str):
        """Wait up to 10 seconds for an element to be present and return it.

        Args:
            by: Locator strategy (one of the ``By`` constants).
            value: Locator value matching ``by``.

        Returns:
            The first element matching the locator.

        Raises:
            NoSuchElementException: If no such element appears within the
                timeout.  The original ``TimeoutException`` is chained as the
                cause.
        """
        try:
            return WebDriverWait(self._driver, 10).until(
                EC.presence_of_element_located((by, value))
            )
        except TimeoutException as err:
            raise NoSuchElementException("Element not found") from err

    # NOTE(review): SECURITY - the default credentials below are hard-coded in
    # source control.  They are kept only so existing callers behave the same;
    # they should be supplied by the caller (environment / secret store) and
    # the committed password rotated.
    def login(self, username: str = "jde2015", password: str = "Bel8#Ans3"):
        """Open the marketplace page and sign in through the login modal.

        Args:
            username: Account login typed into the ``login_control`` field.
            password: Account password typed into the ``password_control``
                field.

        Raises:
            NoSuchElementException: If the modal trigger or either form field
                cannot be found.
        """
        self._driver.get(self.url)
        time.sleep(3)
        # Open login modal
        self.find_elem(By.ID, "auth_ajax_modal_trigger").click()
        login_control = self.find_elem(By.ID, "login_control")
        password_control = self.find_elem(By.ID, "password_control")
        login_control.click()
        login_control.clear()
        login_control.send_keys(username)
        password_control.click()
        password_control.clear()
        password_control.send_keys(password)
        # Fixed pause before submitting; the reason is not visible in this
        # file (presumably lets the form settle) - TODO confirm.
        time.sleep(5)
        password_control.send_keys(Keys.RETURN)

    def search(self):
        """Search for ``keyword`` and process the first "LTL" result.

        Loads the search results page, scans the link in the first cell of
        every result row, and hands the first link whose description contains
        "ltl" (any letter case) to ``accept_documentation``.  Nothing happens
        if no row matches.

        Raises:
            NoSuchElementException: If the results table is not found, or a
                result link has no nested ``div`` holding its description.
        """
        time.sleep(5)
        self._driver.get(self.url + f'/?f_keyword={self.keyword}')
        time.sleep(10)
        table = self.find_elem(By.CSS_SELECTOR, ".search-results > tbody")
        links = table.find_elements(
            By.CSS_SELECTOR, "tr > td:nth-child(1) > a"
        )
        for link in links:
            href = link.get_attribute("href")
            description = link.find_element(By.CSS_SELECTOR, "div").text
            # Check LTL.  Rows without an href are skipped because there is
            # nothing to navigate to.
            if href and re.search(r"[Ll][Tt][Ll]", description):
                self.accept_documentation(href)
                # Only the first matching result is processed.
                break

    def accept_documentation(self, url: str):
        """Open a result page, press its submit button if present, then download.

        Args:
            url: Address of the result page to open.

        Raises:
            NoSuchElementException: If the documentation block is missing
                (raised by ``download_documentation``).
        """
        time.sleep(3)
        self._driver.get(url)
        # Download the documentation.
        # Only the button lookup is guarded.  A NoSuchElementException coming
        # from the download step must not be mistaken for "button missing",
        # otherwise the download step would run twice.
        try:
            download_documentation_button = self.find_elem(
                By.CSS_SELECTOR,
                '#auction_info_td > table > tbody > tr:nth-child(4) table input[type=submit]',
            )
        except NoSuchElementException:
            # No submit button on the page; go straight to the download step.
            pass
        else:
            time.sleep(1)
            download_documentation_button.click()
        self.download_documentation()

    def download_documentation(self):
        """Visit every ``.xlsx`` link inside the ``#download_documentation`` block.

        Each link is opened with the driver, which is expected to make Chrome
        save the file into the configured download directory.

        Raises:
            NoSuchElementException: If the documentation block is not found.
        """
        time.sleep(5)
        documentation_block = self.find_elem(
            By.CSS_SELECTOR, "#download_documentation"
        )
        # Read all hrefs before navigating anywhere, so the link elements
        # cannot go stale if a request replaces the current page.
        hrefs = [
            anchor.get_attribute("href")
            for anchor in documentation_block.find_elements(By.CSS_SELECTOR, "a")
        ]
        for href in hrefs:
            # Anchors without an href return None; skip them along with
            # anything that is not an .xlsx file.
            if not href or not href.endswith(".xlsx"):
                continue
            self._driver.get(href)
            # Fixed pause after each request, presumably to give the download
            # time to start - TODO confirm.
            time.sleep(3)
def main() -> None:
    """Run one scraping session: sign in, then search and download."""
    # The context manager starts the browser on entry and shuts it down on
    # exit, including when login or search raises.
    with Parser() as session:
        session.login()
        session.search()


if __name__ == "__main__":
    main()