Update parser and add new feature

This commit is contained in:
2022-10-07 17:50:42 +03:00
parent ae25f2e2a5
commit 8f0c7e0d9a
10 changed files with 83 additions and 35 deletions

View File

@@ -1,14 +1,18 @@
import requests
import base64
import json
import datetime
from datetime import datetime as dt
import requests
from bs4 import BeautifulSoup
try:
from load import config
except: config = None
from .utils import *
except ImportError: config = None
try:
from .utils import *
except ImportError:
from utils import *
headers = {
@@ -41,10 +45,11 @@ def docs_parse():
soup = BeautifulSoup(page.text, "lxml")
# Это в идеале нужно переписать...
try: output = table_parser(soup, output); #print(output)
except Exception: pass
try: output = test_parser(soup, output)
except Exception as e: raise(e)
url = image_parser(soup)
with requests.get(url=url, allow_redirects=True, stream=True) as r:
output['image'] = True
output['date'] = 'невозможно получить!'
output['data']['all'] = base64.b64encode(r.content).decode('utf-8')
with open(config.data_file, 'w') as f:
@@ -57,3 +62,4 @@ def get_about_replacements() -> dict:
data = json.loads(f.read())
f.close()
return data
docs_parse()

View File

@@ -1,5 +1,6 @@
from bs4 import BeautifulSoup
def table_parser(soup, output):
def table_parser(soup: BeautifulSoup, output):
#Date parser
date = (soup.find("main").findAll('span', style="color:black"))[1]
output["date"] = date.text.replace(u'\xa0', u'')
@@ -22,29 +23,9 @@ def table_parser(soup, output):
return output
def text_parser(soup, output):
    """Parse the plain-text replacements list inside ``<main>`` into ``output``.

    The text of every child of ``<main>`` is normalised (non-breaking spaces
    and carriage returns removed, leading spaces stripped, lower-cased, two
    known banner phrases dropped) and the pieces are concatenated. The result
    is split on newlines: the second line is stored as ``output["date"]`` and
    every later non-empty line is split at its first space into a key and a
    value stored in ``output["data"]``.

    Args:
        soup: Parsed page; must provide ``find("main")`` returning an
            iterable of nodes that expose ``.text``.
        output: Dict filled in place; must already contain a ``"data"`` dict.

    Returns:
        The same ``output`` object, so callers can reassign the result the
        same way they do with ``table_parser``.

    Raises:
        IndexError: If fewer than two lines remain after cleaning.
        ValueError: If an entry line contains no space to split on.
    """
    main = soup.find("main")

    chunks = []
    for node in main:
        cleaned = (
            node.text
            .replace(u"\xa0", u"")
            .lstrip(" ").lower()
            .replace("\r", "")
            .replace("увага! навчання дистанційно!!!", "")
            .replace("заміни до розкладу", "")
        )
        # Skip nodes that contribute nothing but line breaks.
        if cleaned.replace("\n", "") == "":
            continue
        chunks.append(cleaned)

    lines = "".join(chunks).split("\n")
    # The first line is ignored; the second one is taken as the date.
    output["date"] = lines[1]

    for line in lines[2:]:
        if not line:
            continue
        # Everything before the first space is the key, the rest is the value.
        group, replaces = line.split(" ", maxsplit=1)
        output["data"][group] = replaces

    # Return the filled dict: the original fell off the end and yielded None,
    # which clobbers `output` for callers that reassign the parser result.
    return output
def image_parser(soup: BeautifulSoup):
    """Return the ``src`` of the ``.jpg`` image in the centred paragraph.

    Looks up the ``<p>`` element whose ``style`` attribute is exactly
    ``"text-align:center; margin:0cm 0cm 8pt"``, selects an ``<img>`` inside
    it whose ``src`` ends with ``.jpg``, and returns that ``src`` value.

    Args:
        soup: Parsed page to search.

    Returns:
        The value of the matched image's ``src`` attribute.
    """
    centered_paragraph = soup.find(
        "p", style="text-align:center; margin:0cm 0cm 8pt"
    )
    jpg_tag = centered_paragraph.select_one('img[src$=".jpg"]')
    return jpg_tag['src']