1

2023-09-05 00:05:17 +03:00
parent 8754cf841e
commit 9d60205841
6 changed files with 458 additions and 90 deletions
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -1,65 +1,117 @@
-import base64
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import os
 import json
-import datetime
-from datetime import datetime as dt

-import requests
-from bs4 import BeautifulSoup
+from googleapiclient.discovery import build
+from google_auth_oauthlib.flow import InstalledAppFlow
+from google.auth.transport.requests import Request
+from google.oauth2.credentials import Credentials

-try:
-    from load import config
-except ImportError: config = None
-try:
-    from .utils import *
-except ImportError: 
-    from utils import *
+from load import config
+from .utils import Helper
+
+# If modifying these scopes, delete the file token.json.
+SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
+
+__all__ = ['docs_parse', 'get_about_replacements']


-headers = {
-    'user-agent':(
-        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) "
-        "Chrome/62.0.3202.9 Safari/537.36"
-    )
-}
-
-
-def date_parser_helper(days:int, parse:str="%d.%m.20%y"):
-    return dt.strftime(
-            dt.now() +
-            datetime.timedelta(days=days),
-            parse
+def docs_parse() -> None:
+    """Authorize, fetch document config.documentid and dump it to data_file."""
+    creds = None
+    # token.json stores the user's access and refresh tokens; it is created
+    # automatically when the authorization flow completes for the first time.
+    if os.path.exists(config.token_file):
+        creds = Credentials.from_authorized_user_file(
+            config.token_file,
+            SCOPES
        )
+    # If there are no (valid) credentials available, let the user log in.
+    if not creds or not creds.valid:
+        if creds and creds.expired and creds.refresh_token:
+            creds.refresh(Request())
+        else:
+            flow = InstalledAppFlow.from_client_secrets_file(
+                config.credentials_file, SCOPES)
+            creds = flow.run_local_server(port=0)
+        # Save the credentials for the next run
+        with open(config.token_file, 'w') as token:
+            token.write(creds.to_json())

+    service = build('docs', 'v1', credentials=creds)

-def docs_parse():
-
-    output = {
-        "data":{},
-        "another_teacher":None
-    }
-
-    page = requests.get(config.link, headers=headers)
-    page.encoding = 'utf-8'
-
-    soup = BeautifulSoup(page.text, "lxml")
-
-    # Это в идеале нужно переписать...
-    url = image_parser(soup)
-    with requests.get(url=url, allow_redirects=True, stream=True) as r:
-        output['image'] = True
-        output['date'] = 'невозможно получить!'
-        output['data']['all'] = base64.b64encode(r.content).decode('utf-8')
-
+    # Retrieve the documents contents from the Docs service.
+    document = service.documents().get(documentId=config.documentid).execute()
+    if os.path.exists(config.data_file):
+        os.remove(config.data_file)

    with open(config.data_file, 'w') as f:
-        json.dump(output, f, ensure_ascii=False)
+        json.dump(document, f, ensure_ascii=False)
        f.close()


-def get_about_replacements() -> dict:
+def read_parse_data():
    with open(config.data_file, 'r') as f:
        data = json.loads(f.read())
        f.close()
        return data
-docs_parse()
+
+
+def get_about_replacements() -> dict:
+    """Build the replacements summary from the saved Google Docs document.
+
+    The table at the cached index is tried first; when it is missing the
+    document is searched for an inline image, then for a table at another
+    index, and finally for plain-text lines.
+
+    Returns:
+        A dict with 'date' ("Error" when get_date() returned a bool), 'data'
+        and 'another_teacher'. When an inline image was found it also has
+        'image' and 'data' is {"all": <find_image() payload>}; otherwise
+        'data' is dict() of the collected (key, text) pairs.
+    """
+    helper = Helper()
+    document = read_parse_data()
+    info = []
+    # Bound up front: the original set these only in the fallback branch, so
+    # the normal table path died on `if image:` with UnboundLocalError.
+    image, image_bytes = False, None
+    count = 0
+    element = helper.get_table_element()
+
+    try:
+        count = document['body']["content"][element]["table"]["rows"]
+    except (IndexError, KeyError):
+        image, image_bytes = helper.find_image(document)
+        # With an image there is no table to walk (and no row count).
+        element = None if image else helper.find_with_table(document)
+        if element:
+            count = document['body']["content"][element]["table"]["rows"]
+        elif not image:
+            info = helper.find_with_text(document)
+
+    date = helper.get_date(document)
+    another_teacher = helper.teacher(document)
+
+    if element:
+        table = document['body']["content"][element]["table"]
+        for row in table["tableRows"][:count]:
+            cells = row["tableCells"]
+            # First cell is the key; the second cell's paragraphs are joined.
+            key = (cells[0]["content"][0]["paragraph"]["elements"][0]
+                   ["textRun"]["content"].rstrip("\n"))
+            replaces = ''.join(
+                block["paragraph"]["elements"][0]["textRun"]["content"]
+                .rstrip("\n")
+                for block in cells[1]["content"]
+            )
+            info.append((key, replaces))
+
+    date_text = "Error" if isinstance(date, bool) else date
+    if image:
+        return {"image": image, 'date': date_text,
+                'data': {"all": image_bytes},
+                'another_teacher': another_teacher}
+    return {'date': date_text, 'data': dict(info),
+            'another_teacher': another_teacher}
--- a/parser/utils.py
+++ b/parser/utils.py
@@ -1,34 +1,212 @@
-from bs4 import BeautifulSoup
-from typing import Any
+import os
+import datetime
+from datetime import datetime as dt

-def table_parser(soup: BeautifulSoup, output):
-    #Date parser
-    date = (soup.find("main").findAll('span', style="color:black"))[1]
-    output["date"] = date.text.replace(u'\xa0', u'')
+import requests
+
+from load import config


-    #Replaces parser
-    replaces = soup.findAll('tr')
-    for data in replaces:
-        
-        text = (
-            data.find("td", valign="top")
-            .find("span", style="color:black")
-            .text.replace(u'\xa0', u'')
+def date_parser_helper(days:int, parse:str="%d.%m.20%y"):
+    """Format the local date `days` days from now using the `parse` pattern."""
+    shifted = dt.now() + datetime.timedelta(days=days)
+    return shifted.strftime(
+            parse
        )
-        group = (
-            data.find("span", style="color:black")
-            .text.replace(" ", "").replace(u'\xa0', u''))
-        output["data"][group] = text
+
+'''
+self.months = {
+    1: "січень",
+    2: "лютий",
+    3: "березень",
+    4: "квітень",
+    5: "травень",
+    6: "червень",
+    7: "липень",
+    8: "серпень",
+    9: "вересень",
+    10: "жовтень",
+    11: "листопад",
+    12: "грудень"
+}
+'''
+
+class Helper():
+
+    def __init__(self):
+        """Precompute the date strings for today and the next three days."""
+        today, tomorrow, in_two_days, in_three_days = (
+            date_parser_helper(offset) for offset in range(4)
+        )
+        self.date_now = today
+        self.date_next = tomorrow
+        self.weekend_pass = in_two_days
+        self.two_day_pass = in_three_days
+
+        # Phrases find_with_text looks for before splitting a line on "-".
+        self.black_list = ['черговий викладач', today, tomorrow,
+                           in_two_days, in_three_days]
+
+    @staticmethod
+    def find_with_table(document):
+        """Locate the first table among content items 2..15 and cache its index.
+
+        Returns the index (also written to table_element.txt in the config
+        folder), or False when the content ends or no table is found.
+        """
+        for index in range(2, 16):
+            try:
+                document['body']["content"][index]["table"]["rows"]
+            except KeyError:
+                continue
+            except IndexError:
+                return False
+            with open("{}/table_element.txt".format(config.config_folder), 'w') as out:
+                out.write(str(index))
+            return index
+        return False
+
+    def find_with_text(self, document):
+        """Parse "group - text" (or tab-separated) lines from items 4..14.
+
+        A line is split on "-" when it contains one and mentions no
+        black-listed phrase, otherwise on a tab; it must yield exactly two
+        fields. Unparseable lines are skipped up to item 6 and end the scan
+        after it.
+
+        Returns:
+            A list of (group, text) tuples in document order.
+        """
+        data = []
+        content = document['body']["content"]
+        for element in range(4, min(15, len(content))):
+            try:
+                doc = (
+                    content[element]["paragraph"]["elements"][0]
+                    ["textRun"]["content"]
+                ).rstrip("\n").replace("–", "-", 1)
+            except (KeyError, IndexError):
+                # Not a text paragraph (e.g. a table): treat as a bad line.
+                doc = ''
+            lowered = doc.lower()
+            # Check every black-listed phrase; the original `[...][0]` only
+            # ever looked at the first one.
+            allowed = all(p not in lowered for p in self.black_list)
+            separator = "-" if ("-" in doc and allowed) else "\t"
+            try:
+                group, text = doc.split(separator)
+            except ValueError:
+                if element > 6:
+                    break
+                # Skip rather than re-append the previous line's values.
+                continue
+            if text != '':
+                data.append((group.strip(" "),
+                             text.lstrip(" ").replace("\t", "")))
+        return data
+
+    def get_date(self, document):
+        """Return the line that carries the replacements date.
+
+        Content items 1..15 are scanned for a first text run that contains
+        one of the four precomputed dates (spaces ignored) or the phrase
+        "заміни на".
+
+        Returns:
+            The matching text with trailing spaces/newlines stripped, or
+            False when no item matches.
+        """
+        wanted = (
+            self.date_now,
+            self.date_next,
+            self.weekend_pass,
+            self.two_day_pass,
+        )
+        date_element = 1
+        while date_element < 16:
+            try:
+                date = (
+                    document['body']["content"][date_element]
+                    ["paragraph"]["elements"][0]["textRun"]["content"]
+                    .rstrip(" \n"))
+            except (KeyError, IndexError, TypeError):
+                # Not a text paragraph. The original fell through here and
+                # tested a stale (or still unbound) `date`, then skipped an
+                # extra item; move on to the next item instead.
+                date_element += 1
+                continue
+            lowered = date.lower()
+            # The original's lstrip("заміни")/lstrip("на") only removed
+            # leading letters, which cannot change whether a numeric date
+            # is contained, so the space-free text is searched directly.
+            compact = lowered.replace(" ", "")
+            if (
+                any(day in compact for day in wanted)
+                or "заміни на" in lowered
+            ):
+                return date
+            date_element += 1
+        return False
+
+    @staticmethod
+    def get_table_element():
+        """Return the table index cached in table_element.txt, else 6.
+
+        The default 6 is used when the cache file is missing or does not
+        hold an integer.
+        """
+        try:
+            # `with` closes the handle the original open().read() leaked.
+            with open(f"{config.config_folder}/table_element.txt", 'r') as f:
+                return int(f.read())
+        except (FileNotFoundError, ValueError):
+            return 6
+
+    @staticmethod
+    def teacher(document):
+        """Return the text run mentioning 'черговий викладач', or None.
+
+        Paragraphs among content items 1..5 are searched run by run
+        (case-insensitive, en dashes removed before matching). Items that
+        are missing or are not paragraphs, and runs without text, are
+        skipped instead of raising as the original indexing did.
+
+        Returns:
+            The first matching run's text without trailing newlines, or None
+            when nothing matches.
+        """
+        # Slicing tolerates documents with fewer than six content items.
+        for item in document['body']["content"][1:6]:
+            runs = item.get("paragraph", {}).get("elements", [])
+            for run in runs:
+                try:
+                    doc = run["textRun"]["content"].rstrip("\n")
+                except KeyError:
+                    # Elements without a textRun key have no text to test.
+                    continue
+                if 'черговий викладач' in doc.lower().replace("–", ""):
+                    return doc
+
+        return None
+            
+        
+    @classmethod
+    def get_link_and_download(cls, id_doc, document, timeout=30):
+        """Return the raw stream of inline object `id_doc`, or None if absent."""
+        if id_doc not in document.get('inlineObjects', {}):
+            return None
+        link = (document['inlineObjects'][id_doc]['inlineObjectProperties']
+                ['embeddedObject']['imageProperties']['contentUri'])
+        # Bounded wait: without a timeout requests can block forever.
+        return requests.get(link, stream=True, timeout=timeout).raw
    
-    return output
-
-
-def image_parser(soup: BeautifulSoup):
-    image: Any
-    extension = ('png', 'jpg')
-    main = soup.find("main")
-    for ext in extension:
-        image = main.select(f'img[src$=".{ext}"]')
-        if image:
-            return image[0]['src']
+    @classmethod
+    def find_image(cls, document):
+        """Return (True, payload) for the first inline image, else (False, None)."""
+        for item in document['body']["content"]:
+            for el in item.get("paragraph", {}).get("elements", []):  # dicts
+                if "inlineObjectElement" in el:
+                    return True, cls.get_link_and_download(
+                        el['inlineObjectElement']['inlineObjectId'], document)
+        return False, None
+