Доработан парсер
This commit is contained in:
parent
617b503a3b
commit
ae25f2e2a5
2
bot.py
2
bot.py
@ -48,7 +48,7 @@ def main() -> None:
|
|||||||
#loop = asyncio.get_event_loop()
|
#loop = asyncio.get_event_loop()
|
||||||
loop = asyncio.new_event_loop()
|
loop = asyncio.new_event_loop()
|
||||||
asyncio.set_event_loop(loop)
|
asyncio.set_event_loop(loop)
|
||||||
#loop.create_task(scheduler())
|
loop.create_task(scheduler())
|
||||||
|
|
||||||
if config.bot("use_webhook").lower() in ['t', 'true', '1', 'yes', 'y']:
|
if config.bot("use_webhook").lower() in ['t', 'true', '1', 'yes', 'y']:
|
||||||
executor.start_webhook(
|
executor.start_webhook(
|
||||||
|
@ -38,16 +38,12 @@ def docs_parse():
|
|||||||
page = requests.get(config.bot("link"), headers=headers)
|
page = requests.get(config.bot("link"), headers=headers)
|
||||||
page.encoding = 'utf-8'
|
page.encoding = 'utf-8'
|
||||||
|
|
||||||
soup = BeautifulSoup(page.text, "html.parser")
|
soup = BeautifulSoup(page.text, "lxml")
|
||||||
|
|
||||||
# Это в идеале нужно переписать...
|
# Это в идеале нужно переписать...
|
||||||
try: output = table_parser(soup, output); #print(output)
|
try: output = table_parser(soup, output); #print(output)
|
||||||
except Exception: pass
|
except Exception: pass
|
||||||
try: output = one_parser(soup, output); #print(output)
|
try: output = test_parser(soup, output)
|
||||||
except Exception: pass
|
|
||||||
try: output = parser_two(soup, output); #print(output)
|
|
||||||
#except Exception as e: pass
|
|
||||||
#try: output = parser3(soup, output); print(output)
|
|
||||||
except Exception as e: raise(e)
|
except Exception as e: raise(e)
|
||||||
|
|
||||||
|
|
||||||
|
@ -21,48 +21,30 @@ def table_parser(soup, output):
|
|||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def one_parser(soup, output):
|
|
||||||
raw_data = soup.find("main").findAll("p")
|
|
||||||
date = (
|
|
||||||
raw_data[3].text.lower()
|
|
||||||
.replace(u"\xa0", u"").replace("на", "").replace("\r", "")
|
|
||||||
.replace("ЗАМІНИ ДО РОЗКЛАДУ".lower(), "").split("\n")
|
|
||||||
)
|
|
||||||
output["date"] = date[0].lstrip(" ")
|
|
||||||
|
|
||||||
for p in raw_data[4].text.replace(u"\xa0", u"").split("\n"):
|
def text_parser(soup, output):
|
||||||
if p == "": continue
|
main = soup.find("main")
|
||||||
data_rep = (p.lstrip(" ").split(" ", 1))
|
|
||||||
group = data_rep[0]
|
|
||||||
text = data_rep[1].replace("\r", "").lstrip(" ")
|
|
||||||
output["data"][group] = text
|
|
||||||
return output
|
|
||||||
|
|
||||||
def parser_two(soup, output):
|
text: str = ''
|
||||||
raw_data = soup.find("main").findAll("p")[2]
|
for j in main:
|
||||||
data = raw_data.text.split("\n")
|
r_text = (
|
||||||
output["date"] = data[1].replace("\r", "")
|
j.text
|
||||||
|
.replace(u"\xa0", u"")
|
||||||
for p in data[3:]:
|
.lstrip(" ").lower()
|
||||||
r_data = p.split(maxsplit=1)
|
|
||||||
try:
|
|
||||||
group = r_data[0].replace(u"\xa0", u"").replace("\r", "")
|
|
||||||
text = r_data[1].replace(u"\xa0", u"").replace("\r", "")
|
|
||||||
except IndexError: break
|
|
||||||
output["data"][group] = text
|
|
||||||
return output
|
|
||||||
|
|
||||||
def parser3(soup, output):
|
|
||||||
raw_data = soup.find("main").findAll("p")
|
|
||||||
|
|
||||||
output["date"] = (
|
|
||||||
raw_data[2].text
|
|
||||||
.replace("\r", "")
|
.replace("\r", "")
|
||||||
.replace("ЗАМІНИ НА", "").lstrip(" ").rstrip(" ").lower()
|
.replace("увага! навчання дистанційно!!!", "")
|
||||||
|
.replace("заміни до розкладу", "")
|
||||||
)
|
)
|
||||||
for p in raw_data[5:]:
|
if r_text.replace("\n", "") == "": continue
|
||||||
r_data = p.text.split("-", maxsplit=1)
|
text += r_text
|
||||||
group = r_data[0]
|
|
||||||
text = r_data[1]
|
data = text.split("\n")
|
||||||
output["data"][group] = text
|
|
||||||
|
output["date"] = data[1]
|
||||||
|
|
||||||
|
for p in data[2:]:
|
||||||
|
if p == "": continue
|
||||||
|
group, replaces = p.split(" ", maxsplit=1)
|
||||||
|
output["data"][group] = replaces
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
#google-auth-httplib2
|
#google-auth-httplib2
|
||||||
#google-auth-oauthlib
|
#google-auth-oauthlib
|
||||||
bs4
|
bs4
|
||||||
|
lxml
|
||||||
peewee
|
peewee
|
||||||
aiogram
|
aiogram
|
||||||
cryptography
|
cryptography
|
||||||
|
Loading…
Reference in New Issue
Block a user