Improved the parser

tema 2022-02-22 14:29:44 +02:00
parent 617b503a3b
commit ae25f2e2a5
Signed by: tema
GPG Key ID: 21FDB6D162488F6F
4 changed files with 29 additions and 50 deletions

bot.py (2 changes)

@@ -48,7 +48,7 @@ def main() -> None:
     #loop = asyncio.get_event_loop()
     loop = asyncio.new_event_loop()
     asyncio.set_event_loop(loop)
-    #loop.create_task(scheduler())
+    loop.create_task(scheduler())
     if config.bot("use_webhook").lower() in ['t', 'true', '1', 'yes', 'y']:
         executor.start_webhook(
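
Note: the change above re-enables the background scheduler() task, which is queued on the loop before the webhook starts. A minimal sketch of that pattern follows; scheduler's body is not shown in this commit, so the 600-second interval and the call to a parse step are assumptions, not taken from this diff:

import asyncio

def docs_parse() -> None:
    # Stand-in for the repo's parser entry point (body assumed).
    print("re-parsing the substitutions page...")

async def scheduler() -> None:
    # Hypothetical periodic job: re-run the parser every 10 minutes.
    while True:
        docs_parse()
        await asyncio.sleep(600)

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.create_task(scheduler())  # queued now, starts once the loop runs
loop.run_forever()             # stands in for executor.start_webhook(...)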


@@ -38,16 +38,12 @@ def docs_parse():
     page = requests.get(config.bot("link"), headers=headers)
     page.encoding = 'utf-8'
 
-    soup = BeautifulSoup(page.text, "html.parser")
+    soup = BeautifulSoup(page.text, "lxml")
 
     # Ideally this should be rewritten...
     try: output = table_parser(soup, output); #print(output)
     except Exception: pass
-    try: output = one_parser(soup, output); #print(output)
-    except Exception: pass
-    try: output = parser_two(soup, output); #print(output)
-    #except Exception as e: pass
-    #try: output = parser3(soup, output); print(output)
+    try: output = text_parser(soup, output)
     except Exception as e: raise(e)
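
Note on the html.parser to lxml swap above: the two backends repair malformed markup differently, so they can hand the downstream parsers different trees. A quick way to compare them on the same input; the sample HTML is illustrative, not taken from the scraped site:

from bs4 import BeautifulSoup

sample = "<main><p>first<p>second</main>"  # unclosed <p> tags

# Each backend applies its own error recovery, so the resulting
# trees (and thus find()/findAll() results) may differ.
for backend in ("html.parser", "lxml"):
    soup = BeautifulSoup(sample, backend)
    print(backend, [p.text for p in soup.find("main").findAll("p")])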


@@ -21,48 +21,30 @@ def table_parser(soup, output):
 
     return output
 
-def one_parser(soup, output):
-    raw_data = soup.find("main").findAll("p")
-
-    date = (
-        raw_data[3].text.lower()
-        .replace(u"\xa0", u"").replace("на", "").replace("\r", "")
-        .replace("ЗАМІНИ ДО РОЗКЛАДУ".lower(), "").split("\n")
-    )
-    output["date"] = date[0].lstrip(" ")
-
-    for p in raw_data[4].text.replace(u"\xa0", u"").split("\n"):
-        if p == "": continue
-        data_rep = (p.lstrip(" ").split(" ", 1))
-        group = data_rep[0]
-        text = data_rep[1].replace("\r", "").lstrip(" ")
-        output["data"][group] = text
-
-    return output
-
-def parser_two(soup, output):
-    raw_data = soup.find("main").findAll("p")[2]
-    data = raw_data.text.split("\n")
-    output["date"] = data[1].replace("\r", "")
-
-    for p in data[3:]:
-        r_data = p.split(maxsplit=1)
-        try:
-            group = r_data[0].replace(u"\xa0", u"").replace("\r", "")
-            text = r_data[1].replace(u"\xa0", u"").replace("\r", "")
-        except IndexError: break
-        output["data"][group] = text
-
-    return output
-
-def parser3(soup, output):
-    raw_data = soup.find("main").findAll("p")
-    output["date"] = (
-        raw_data[2].text
-        .replace("\r", "")
-        .replace("ЗАМІНИ НА", "").lstrip(" ").rstrip(" ").lower()
-    )
-
-    for p in raw_data[5:]:
-        r_data = p.text.split("-", maxsplit=1)
-        group = r_data[0]
-        text = r_data[1]
-        output["data"][group] = text
-
-    return output
+def text_parser(soup, output):
+    main = soup.find("main")
+
+    text: str = ''
+    for j in main:
+        r_text = (
+            j.text
+            .replace(u"\xa0", u"")
+            .lstrip(" ").lower()
+            .replace("\r", "")
+            .replace("увага! навчання дистанційно!!!", "")
+            .replace("заміни до розкладу", "")
+        )
+        if r_text.replace("\n", "") == "": continue
+        text += r_text
+
+    data = text.split("\n")
+
+    output["date"] = data[1]
+    for p in data[2:]:
+        if p == "": continue
+        group, replaces = p.split(" ", maxsplit=1)
+        output["data"][group] = replaces
+
+    return output
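
Note: the rewrite above collapses three layout-specific parsers into one pass that flattens all of <main> into plain text, then splits it into a date line and per-group replacement lines. A sketch of exercising it on a fake page; the markup, the expected output shape, and the docs_parser module name are assumptions for illustration:

from bs4 import BeautifulSoup
from docs_parser import text_parser  # module name is an assumption

html = ("<main>"
        "<p>ЗАМІНИ ДО РОЗКЛАДУ\nна 22.02.2022\n</p>"
        "<p>101 математика замість фізики\n</p>"
        "</main>")

output = {"date": "", "data": {}}
output = text_parser(BeautifulSoup(html, "lxml"), output)
print(output["date"])  # "на 22.02.2022" (header stripped, lowercased)
print(output["data"])  # {"101": "математика замість фізики"}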

requirements.txt

@@ -2,6 +2,7 @@
 #google-auth-httplib2
 #google-auth-oauthlib
 bs4
+lxml
 peewee
 aiogram
 cryptography
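
Note: pinning lxml here matters because bs4 resolves the "lxml" tree builder only at runtime; if the package is missing, the bot fails when the soup is built, not at import. An illustrative check:

from bs4 import BeautifulSoup, FeatureNotFound

try:
    BeautifulSoup("<p>ok</p>", "lxml")
except FeatureNotFound:
    # bs4 raises FeatureNotFound when the requested tree builder
    # (here lxml) is not installed.
    print("lxml missing: pip install lxml")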