Доработан парсер

This commit is contained in:
tema 2022-02-22 14:29:44 +02:00
parent 617b503a3b
commit ae25f2e2a5
Signed by: tema
GPG Key ID: 21FDB6D162488F6F
4 changed files with 29 additions and 50 deletions

2
bot.py
View File

@ -48,7 +48,7 @@ def main() -> None:
#loop = asyncio.get_event_loop()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
#loop.create_task(scheduler())
loop.create_task(scheduler())
if config.bot("use_webhook").lower() in ['t', 'true', '1', 'yes', 'y']:
executor.start_webhook(

View File

@ -38,16 +38,12 @@ def docs_parse():
page = requests.get(config.bot("link"), headers=headers)
page.encoding = 'utf-8'
soup = BeautifulSoup(page.text, "html.parser")
soup = BeautifulSoup(page.text, "lxml")
# Это в идеале нужно переписать...
try: output = table_parser(soup, output); #print(output)
except Exception: pass
try: output = one_parser(soup, output); #print(output)
except Exception: pass
try: output = parser_two(soup, output); #print(output)
#except Exception as e: pass
#try: output = parser3(soup, output); print(output)
try: output = test_parser(soup, output)
except Exception as e: raise(e)

View File

@ -21,48 +21,30 @@ def table_parser(soup, output):
return output
def one_parser(soup, output):
    """Parse the page layout where the replacements are plain paragraphs.

    Reads every ``<p>`` found under ``<main>``. The fourth paragraph supplies
    the date (only its first line is used) and the fifth paragraph holds one
    ``<group> <replacement text>`` entry per line.

    Args:
        soup: Parsed page. Must provide ``find("main").findAll("p")`` whose
            items expose a ``.text`` string.
        output: Mapping that already contains a ``"data"`` dict. It is
            updated in place.

    Returns:
        The same ``output`` object, with ``output["date"]`` set and one
        ``output["data"][group]`` entry for every parsed line.

    Raises:
        IndexError: If fewer than five paragraphs are found under ``<main>``.
    """
    paragraphs = soup.find("main").findAll("p")

    # NOTE: str.replace drops *every* "на" occurrence in the lowered text,
    # not only the leading preposition in front of the date.
    header_lines = (
        paragraphs[3].text.lower()
        .replace("\xa0", "")
        .replace("на", "")
        .replace("\r", "")
        .replace("ЗАМІНИ ДО РОЗКЛАДУ".lower(), "")
        .split("\n")
    )
    output["date"] = header_lines[0].lstrip(" ")

    for line in paragraphs[4].text.replace("\xa0", "").split("\n"):
        group, separator, text = line.lstrip(" ").partition(" ")
        if not separator:
            # No space means there is no replacement text on this line: it is
            # empty, a bare "\r" left over from CRLF text, or a group name on
            # its own. Skip it instead of failing with IndexError.
            continue
        output["data"][group] = text.replace("\r", "").lstrip(" ")
    return output
def text_parser(soup, output):
main = soup.find("main")
def parser_two(soup, output):
    """Parse the layout where the third ``<p>`` under ``<main>`` holds the list.

    The paragraph text is split into lines. The second line becomes the
    date, and entries are read from the fourth line onwards as
    ``<group> <text>`` pairs separated by whitespace.

    Args:
        soup: Parsed page. Must provide ``find("main").findAll("p")`` whose
            items expose a ``.text`` string.
        output: Mapping that already contains a ``"data"`` dict. It is
            updated in place.

    Returns:
        The same ``output`` object, with ``output["date"]`` set and one
        ``output["data"][group]`` entry per parsed line.
    """
    block = soup.find("main").findAll("p")[2]
    lines = block.text.split("\n")
    output["date"] = lines[1].replace("\r", "")

    for line in lines[3:]:
        parts = line.split(maxsplit=1)
        if len(parts) < 2:
            # The first line without both a group and a text ends parsing;
            # everything after it is ignored.
            break
        group, text = (
            part.replace("\xa0", "").replace("\r", "") for part in parts
        )
        output["data"][group] = text
    return output
def parser3(soup, output):
raw_data = soup.find("main").findAll("p")
output["date"] = (
raw_data[2].text
text: str = ''
for j in main:
r_text = (
j.text
.replace(u"\xa0", u"")
.lstrip(" ").lower()
.replace("\r", "")
.replace("ЗАМІНИ НА", "").lstrip(" ").rstrip(" ").lower()
.replace("увага! навчання дистанційно!!!", "")
.replace("заміни до розкладу", "")
)
for p in raw_data[5:]:
r_data = p.text.split("-", maxsplit=1)
group = r_data[0]
text = r_data[1]
output["data"][group] = text
if r_text.replace("\n", "") == "": continue
text += r_text
data = text.split("\n")
output["date"] = data[1]
for p in data[2:]:
if p == "": continue
group, replaces = p.split(" ", maxsplit=1)
output["data"][group] = replaces
return output

View File

@ -2,6 +2,7 @@
#google-auth-httplib2
#google-auth-oauthlib
bs4
lxml
peewee
aiogram
cryptography