#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Fetch the replacements Google Doc and convert it into a replacements dict."""
import os
import json

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials

from load import config
from .utils import Helper

# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']

__all__ = ['docs_parse', 'get_about_replacements']


def _load_credentials():
    """Return usable Google credentials, refreshing or re-authorising if needed.

    The token file stores the user's access and refresh tokens; it is created
    the first time the authorisation flow completes and rewritten whenever the
    credentials had to be refreshed or re-created.
    """
    creds = None
    if os.path.exists(config.token_file):
        creds = Credentials.from_authorized_user_file(config.token_file, SCOPES)
    if creds and creds.valid:
        return creds

    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        # No (valid) credentials available: let the user log in.
        flow = InstalledAppFlow.from_client_secrets_file(
            config.credentials_file, SCOPES)
        creds = flow.run_local_server(port=0)

    # Save the credentials for the next run.
    with open(config.token_file, 'w') as token:
        token.write(creds.to_json())
    return creds


def docs_parse() -> None:
    """Download the configured Google Doc and dump it as JSON to ``config.data_file``."""
    service = build('docs', 'v1', credentials=_load_credentials())

    # Retrieve the document contents from the Docs service.
    document = service.documents().get(documentId=config.documentid).execute()

    # Mode 'w' already truncates an existing file, so no explicit removal is
    # needed.  The encoding is pinned because ensure_ascii=False emits raw
    # non-ASCII text that a non-UTF-8 locale default could fail to encode.
    with open(config.data_file, 'w', encoding='utf-8') as f:
        json.dump(document, f, ensure_ascii=False)


def read_parse_data():
    """Return the document previously stored by :func:`docs_parse`."""
    with open(config.data_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def _first_run_text(structural_element) -> str:
    """Return the first text run of a paragraph without trailing newlines.

    Returns an empty string when the element is not a paragraph or its first
    element is not a text run.
    """
    try:
        text = (structural_element["paragraph"]["elements"][0]
                ["textRun"]["content"])
    except (KeyError, IndexError):
        return ''
    return text.rstrip("\n")


def _table_at(content, index):
    """Return the table stored at ``content[index]`` or ``None`` if there is none."""
    try:
        table = content[index]["table"]
    except (IndexError, KeyError):
        return None
    return table if "rows" in table else None


def _table_pairs(table) -> list:
    """Convert a Docs table into ``(first cell text, second cell text)`` pairs."""
    pairs = []
    for row in table["tableRows"][:table["rows"]]:
        cells = row["tableCells"]
        if len(cells) < 2:
            # A row without a second cell carries no replacement text.
            continue
        first_cell = cells[0]["content"]
        group = _first_run_text(first_cell[0]) if first_cell else ''
        # The second cell may hold several paragraphs; they are concatenated.
        replaces = ''.join(_first_run_text(p) for p in cells[1]["content"])
        pairs.append((group, replaces))
    return pairs


def get_about_replacements() -> dict:
    """Build the replacements summary from the stored document.

    Lookup order: the table at the cached index, then an inline image, then a
    table search, and finally plain-text parsing.

    Returns:
        A dict with ``date``, ``data`` and ``another_teacher`` keys.  When the
        document holds an image instead of a table, the dict also has an
        ``image`` key and ``data`` is ``{"all": <image payload>}``.
    """
    helper = Helper()
    document = read_parse_data()
    content = document['body']["content"]

    # Bound up front: the original only assigned these on the fallback path,
    # which raised NameError whenever the cached table index was valid.
    image, image_bytes = False, None
    info = []

    table = _table_at(content, helper.get_table_element())
    if table is None:
        image, image_bytes = helper.find_image(document)
        if not image:
            element = helper.find_with_table(document)
            if element:
                table = _table_at(content, element)
            else:
                info = helper.find_with_text(document)

    if table is not None:
        info = _table_pairs(table)

    date = helper.get_date(document)
    another_teacher = helper.teacher(document)
    # get_date() returns False when no date line was recognised.
    shown_date = date if isinstance(date, str) else "Error"

    if image:
        return {
            "image": image,
            'date': shown_date,
            'data': {"all": image_bytes},
            'another_teacher': another_teacher,
        }
    return {
        'date': shown_date,
        'data': dict(info),
        'another_teacher': another_teacher,
    }
"""Helpers that dig replacements data out of a Google Docs API document."""
import os
import datetime
from datetime import datetime as dt


def date_parser_helper(days: int, parse: str = "%d.%m.20%y") -> str:
    """Return the date ``days`` days from now formatted with ``parse``."""
    return (dt.now() + datetime.timedelta(days=days)).strftime(parse)


def _table_element_path() -> str:
    """Return the path of the file that caches the replacements table index."""
    # Imported lazily so the parsing helpers stay importable (and testable)
    # without the project's ``load`` package.
    from load import config
    return f"{config.config_folder}/table_element.txt"


class Helper():
    """Locate the date, duty teacher, table, text or image in a Docs document.

    ``document`` arguments are the dict returned by the Google Docs API
    ``documents.get`` call: ``document['body']['content']`` is a list of
    structural elements (paragraphs, tables, ...).
    """

    def __init__(self):
        # Dates that may appear in the document header.
        self.date_now = date_parser_helper(0)
        self.date_next = date_parser_helper(1)
        self.weekend_pass = date_parser_helper(2)
        self.two_day_pass = date_parser_helper(3)

        # Lines containing any of these are not "group - replacement" lines.
        self.black_list = [
            'черговий викладач',
            self.date_now,
            self.date_next,
            self.weekend_pass,
            self.two_day_pass
        ]

    @staticmethod
    def _first_text(item):
        """Return the first text run of a paragraph element, or ``None``."""
        try:
            return item["paragraph"]["elements"][0]["textRun"]["content"]
        except (KeyError, IndexError, TypeError):
            return None

    @staticmethod
    def find_with_table(document):
        """Find the first table among content elements 2..15 and cache its index.

        Returns:
            The index of the table, or ``False`` when there is none in range
            or the document is shorter than the scanned range.
        """
        content = document['body']["content"]
        for index in range(2, 16):
            try:
                item = content[index]
            except IndexError:
                return False
            if "rows" in item.get("table", {}):
                with open(_table_element_path(), 'w') as f:
                    f.write(str(index))
                return index
        return False

    def find_with_text(self, document):
        """Parse ``group - replacement`` lines from content elements 4..14.

        A line is split on an en dash when it has one, otherwise on the first
        hyphen; lines without a dash (or containing a black-listed phrase) are
        split on a tab.  Scanning stops at the first unsplittable line after
        element 6.  Non-paragraph elements are skipped.

        Returns:
            A list of ``(group, replacement)`` tuples.
        """
        content = document['body']["content"]
        data = []
        for element in range(4, min(15, len(content))):
            raw = self._first_text(content[element])
            if raw is None:
                continue
            line = raw.rstrip("\n")
            lowered = line.lower()

            has_dash = "–" in line or "-" in line
            # Every black-list entry is checked; the original only looked at
            # the first one.
            if has_dash and not any(p in lowered for p in self.black_list):
                # Prefer the en dash so hyphens inside names survive.
                separator = "–" if "–" in line else "-"
            else:
                separator = "\t"

            group, found, text = line.partition(separator)
            if not found:
                if element > 6:
                    break
                # Skip instead of re-appending the previous line's values.
                continue
            if text != '':
                data.append(
                    (group.strip(" "), text.lstrip(" ").replace("\t", ""))
                )
        return data

    def get_date(self, document):
        """Return the header line that carries the replacements date.

        Content elements 1..15 are scanned.  A line matches when it contains
        "заміни на" or, ignoring spaces, one of the four dates computed in
        ``__init__``.

        Returns:
            The matching line without trailing spaces/newlines, or ``False``.
        """
        content = document['body']["content"]
        known_dates = (
            self.date_now, self.date_next,
            self.weekend_pass, self.two_day_pass,
        )
        for index in range(1, min(16, len(content))):
            raw = self._first_text(content[index])
            if raw is None:
                continue
            date = raw.rstrip(" \n")
            lowered = date.lower()
            compact = lowered.replace(" ", "")
            if "заміни на" in lowered or any(d in compact for d in known_dates):
                return date
        return False

    @staticmethod
    def get_table_element():
        """Return the cached table index, or 6 when no usable cache exists."""
        try:
            with open(_table_element_path(), 'r') as f:
                return int(f.read())
        except (FileNotFoundError, ValueError):
            return 6

    @staticmethod
    def teacher(document):
        """Return the text run mentioning the duty teacher, or ``None``.

        Only paragraphs among content elements 1..5 are inspected.
        """
        for item in document['body']["content"][1:6]:
            for run in item.get("paragraph", {}).get("elements", []):
                text = run.get("textRun", {}).get("content")
                if text is None:
                    continue
                doc = text.rstrip("\n")
                if 'черговий викладач' in doc.lower().replace("–", ""):
                    return doc
        return None

    @classmethod
    def get_link_and_download(cls, id_doc, document):
        """Download the inline image ``id_doc`` and return the raw response stream.

        Returns ``None`` when the document has no such inline object or the
        object has no image content URI.
        """
        try:
            link = (document
                    ['inlineObjects'][id_doc]['inlineObjectProperties']
                    ['embeddedObject']['imageProperties']['contentUri'])
        except KeyError:
            return None
        # Imported lazily: only this method needs the network stack.
        import requests
        # A timeout keeps a stalled download from blocking the caller forever.
        r = requests.get(link, stream=True, timeout=30)
        # NOTE(review): ``r.raw`` is the unread urllib3 stream, not bytes -
        # confirm that callers really expect a stream here.
        return r.raw

    @classmethod
    def find_image(cls, document):
        """Find the first downloadable inline image in the document body.

        Returns:
            ``(True, payload)`` with the result of
            :meth:`get_link_and_download`, or ``(False, None)``.
        """
        for item in document['body']["content"]:
            # ``elements`` is a list of dicts, so each one has to be inspected;
            # a membership test on the list itself never matches.
            for run in item.get("paragraph", {}).get("elements", []):
                inline = run.get("inlineObjectElement")
                if not inline or "inlineObjectId" not in inline:
                    continue
                payload = cls.get_link_and_download(
                    inline["inlineObjectId"], document)
                if payload is not None:
                    return True, payload
        return False, None
"""Legacy parser that scrapes the replacements page of the website."""
import base64
import json
import datetime
from datetime import datetime as dt
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    from bs4 import BeautifulSoup

try:
    from load import config
except ImportError:
    config = None


headers = {
    'user-agent': (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/62.0.3202.9 Safari/537.36"
    )
}


def date_parser_helper(days: int, parse: str = "%d.%m.20%y") -> str:
    """Return the date ``days`` days from now formatted with ``parse``."""
    return (dt.now() + datetime.timedelta(days=days)).strftime(parse)


def table_parser(soup: "BeautifulSoup", output):
    """Fill ``output`` with the date and the per-group replacements of a page.

    The date is the second black ``<span>`` inside ``<main>``; every ``<tr>``
    contributes one ``group -> text`` entry to ``output["data"]``.  Rows that
    lack the expected cells are skipped, and ``output["date"]`` is left
    untouched when the page has no date span.

    Returns:
        The same ``output`` dict, for convenience.
    """
    # Date parser
    main = soup.find("main")
    spans = main.find_all('span', style="color:black") if main else []
    if len(spans) > 1:
        output["date"] = spans[1].text.replace(u'\xa0', u'')

    # Replaces parser
    for row in soup.find_all('tr'):
        group_span = row.find("span", style="color:black")
        cell = row.find("td", valign="top")
        text_span = cell.find("span", style="color:black") if cell else None
        if group_span is None or text_span is None:
            continue
        group = group_span.text.replace(" ", "").replace(u'\xa0', u'')
        output["data"][group] = text_span.text.replace(u'\xa0', u'')

    return output


def image_parser(soup: "BeautifulSoup") -> Optional[str]:
    """Return the ``src`` of the first ``.png`` (then ``.jpg``) image in ``<main>``.

    Returns ``None`` when the page has no ``<main>`` or no matching image.
    """
    main = soup.find("main")
    if main is None:
        return None
    for ext in ('png', 'jpg'):
        image = main.select(f'img[src$=".{ext}"]')
        if image:
            return image[0]['src']
    return None


def docs_parse():
    """Download the replacements image and store it, base64-encoded, as JSON.

    Raises:
        requests.exceptions.MissingSchema: the page contains no image.
    """
    # Imported lazily so the pure helpers above work without these packages.
    import requests
    from bs4 import BeautifulSoup

    output = {
        "data": {},
        "another_teacher": None
    }

    page = requests.get(config.link, headers=headers, timeout=30)
    page.encoding = 'utf-8'

    soup = BeautifulSoup(page.text, "lxml")

    # Ideally this should be rewritten...
    url = image_parser(soup)
    if url is None:
        # Passing None to requests.get() raised MissingSchema; the type is
        # kept so existing handlers still match, with a clearer message.
        raise requests.exceptions.MissingSchema(
            "no .png/.jpg image found inside <main>")

    with requests.get(
            url=url, allow_redirects=True, stream=True, timeout=30) as r:
        output['image'] = True
        output['date'] = 'невозможно получить!'
        output['data']['all'] = base64.b64encode(r.content).decode('utf-8')

    # ensure_ascii=False writes raw Cyrillic, so the encoding is pinned.
    with open(config.data_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False)


def get_about_replacements() -> dict:
    """Return the data previously stored by :func:`docs_parse`."""
    with open(config.data_file, 'r', encoding='utf-8') as f:
        return json.load(f)


if __name__ == "__main__":
    # Only fetch when run as a script; importing the module must not trigger
    # network and file I/O.
    docs_parse()