This commit is contained in:
tema 2023-09-05 00:05:17 +03:00
parent 8754cf841e
commit 9d60205841
Signed by: tema
GPG Key ID: 21FDB6D162488F6F
6 changed files with 458 additions and 90 deletions

View File

@@ -1,65 +1,117 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import json

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials

from load import config
from .utils import Helper

# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']

__all__ = ['docs_parse', 'get_about_replacements']


def docs_parse() -> None:
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the
    # first time.
    if os.path.exists(config.token_file):
        creds = Credentials.from_authorized_user_file(
            config.token_file,
            SCOPES
        )
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                config.credentials_file, SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run.
        with open(config.token_file, 'w') as token:
            token.write(creds.to_json())

    service = build('docs', 'v1', credentials=creds)

    # Retrieve the document's contents from the Docs service.
    document = service.documents().get(documentId=config.documentid).execute()

    if os.path.exists(config.data_file):
        os.remove(config.data_file)

    with open(config.data_file, 'w') as f:
        json.dump(document, f, ensure_ascii=False)


def read_parse_data():
    with open(config.data_file, 'r') as f:
        data = json.loads(f.read())
    return data


def get_about_replacements() -> dict:
    helper = Helper()
    document = read_parse_data()
    info = []
    # Initialized up front so the checks below cannot hit unbound names
    # when the table lookup succeeds on the first try.
    image, image_bytes = False, None
    count = 0
    element = helper.get_table_element()
    try:
        count = document['body']["content"][element]["table"]["rows"]
    except (IndexError, KeyError):
        image, image_bytes = helper.find_image(document)
        if not image:
            element = helper.find_with_table(document)
            if element:
                count = document['body']["content"][element]["table"]["rows"]
            else:
                info = helper.find_with_text(document)
    date = helper.get_date(document)
    another_teacher = helper.teacher(document)
    if element:
        for c in range(count):
            more_replaces = (document['body']
                             ["content"][element]["table"]
                             ["tableRows"][c]["tableCells"][1]
                             ["content"])
            replaces = ''
            for i in range(len(more_replaces)):
                replaces += (document['body']["content"][element]["table"]
                             ["tableRows"][c]["tableCells"][1]
                             ["content"][i]["paragraph"]["elements"][0]
                             ["textRun"]["content"].rstrip("\n"))
            info.append(
                (
                    document['body']["content"][element]["table"]
                    ["tableRows"][c]["tableCells"][0]
                    ["content"][0]["paragraph"]["elements"][0]
                    ["textRun"]["content"].rstrip("\n"),
                    replaces
                )
            )
    if image:
        return {
            "image": image,
            'date': date if date is not False else "Error",
            'data': {"all": image_bytes},
            'another_teacher': another_teacher,
        }
    return {
        'date': date if date is not False else "Error",
        'data': dict(info),
        'another_teacher': another_teacher,
    }
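Taken together, the new flow is: authorize once, cache the raw document JSON on disk, then parse the cache offline. A minimal sketch of a caller, assuming `load.config` provides `token_file`, `credentials_file`, `documentid`, and `data_file`; the package name is an assumption, since the diff omits the file path:

# Hypothetical driver for the new Google Docs flow; the package name
# docs_parser is assumed, as the diff header does not show the path.
from docs_parser import docs_parse, get_about_replacements

docs_parse()  # runs the OAuth flow if needed and caches the document JSON
replacements = get_about_replacements()  # walks the cached JSON
print(replacements['date'], replacements['another_teacher'])
for group, text in replacements['data'].items():
    print(group, '->', text)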

View File

@@ -1,34 +1,212 @@
import os
import datetime
from datetime import datetime as dt

import requests

from load import config


def date_parser_helper(days: int, parse: str = "%d.%m.20%y"):
    return dt.strftime(
        dt.now() +
        datetime.timedelta(days=days),
        parse
    )


'''
self.months = {
    1: "січень",
    2: "лютий",
    3: "березень",
    4: "квітень",
    5: "травень",
    6: "червень",
    7: "липень",
    8: "серпень",
    9: "вересень",
    10: "жовтень",
    11: "листопад",
    12: "грудень"
}
'''


class Helper:
    def __init__(self):
        self.date_now = date_parser_helper(0)
        self.date_next = date_parser_helper(1)
        self.weekend_pass = date_parser_helper(2)
        self.two_day_pass = date_parser_helper(3)
        self.black_list = [
            'черговий викладач',
            self.date_now,
            self.date_next,
            self.weekend_pass,
            self.two_day_pass
        ]

    @staticmethod
    def find_with_table(document):
        c_element = 2
        while True:
            try:
                document['body']["content"][c_element]["table"]["rows"]
                break
            except KeyError:
                c_element += 1
                if c_element > 15:
                    return False
            except IndexError:
                return False
        # Cache the element index so get_table_element() can skip the scan
        # next time.
        with open("{}/table_element.txt".format(config.config_folder), 'w') as f:
            f.write(str(c_element))
        return c_element

    def find_with_text(self, document):
        format_charset = '-'
        alternative_format_charset = "\t"
        element = 4
        data = []
        text = ''
        while element < 15:
            doc = (
                document['body']["content"][element]
                ["paragraph"]["elements"][0]["textRun"]["content"]
            ).rstrip("\n").replace("–", "-", 1)  # normalize a dash variant
            # (the replaced character was garbled in the original diff)
            if (
                ("-" in doc)
                # and ("\t" not in doc)
                # Check every blacklist entry, not just the first one.
                and all(p not in doc.lower() for p in self.black_list)
            ):
                try:
                    group, text = doc.split(format_charset)
                except ValueError:
                    if element > 6:
                        break
                    else:
                        try:
                            group, text = doc.split(alternative_format_charset)
                        except ValueError:
                            if element > 6:
                                break
                if text != '':
                    data.append(
                        (group.strip(" "), text.lstrip(" ").replace("\t", ""))
                    )
            element += 1
        return data

    def get_date(self, document):
        date_element = 1
        while date_element < 16:
            try:
                date = (
                    document['body']["content"][date_element]
                    ["paragraph"]["elements"][0]["textRun"]["content"]
                    .rstrip(" \n"))
            except (IndexError, KeyError):
                # Nothing readable at this index; try the next element.
                date_element += 1
                continue
            cleaned = (date.lower()
                       .lstrip("заміни").lstrip("на").replace(" ", ""))
            if (
                self.date_now in cleaned
                or self.date_next in cleaned
                or self.weekend_pass in cleaned
                or self.two_day_pass in cleaned
                or "заміни на" in date.lower()
            ):
                return date
            date_element += 1
        return False

    @staticmethod
    def get_table_element():
        if os.path.exists(f"{config.config_folder}/table_element.txt"):
            with open(f"{config.config_folder}/table_element.txt", 'r') as f:
                element = int(f.read())
        else:
            element = 6
        return element

    @staticmethod
    def teacher(document):
        element = 1
        while element < 6:
            if "paragraph" in document['body']["content"][element]:
                length_element = (len(document['body']["content"][element]
                                      ["paragraph"]["elements"]))
                doc = (
                    document['body']["content"][element]["paragraph"]["elements"]
                    [0]["textRun"]["content"].rstrip("\n")
                )
                if 'черговий викладач' in doc.lower():
                    return doc
                elif length_element > 1:
                    for p in range(length_element):
                        doc = (
                            document['body']["content"][element]
                            ["paragraph"]["elements"]
                            [p]["textRun"]["content"].rstrip("\n")
                        )
                        if 'черговий викладач' in doc.lower():
                            return doc
            element += 1

    @classmethod
    def get_link_and_download(cls, id_doc, document):
        if "inlineObjects" in document:
            if id_doc in document['inlineObjects']:
                link = (document
                        ['inlineObjects'][id_doc]['inlineObjectProperties']
                        ['embeddedObject']['imageProperties']['contentUri'])
                r = requests.get(link, stream=True)
                return r.raw

    @classmethod
    def find_image(cls, document):
        for i in document['body']["content"]:
            if ("paragraph" in i) and ("elements" in i["paragraph"]):
                # "elements" is a list, so look for the inline object inside
                # each element rather than on the list itself.
                for el in i["paragraph"]["elements"]:
                    if "inlineObjectElement" in el:
                        return True, cls.get_link_and_download(
                            el["inlineObjectElement"]["inlineObjectId"],
                            document)
        return False, None
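All of these lookups walk the Google Docs API document structure (`body.content[*].paragraph.elements[*].textRun.content`). A trimmed, made-up fragment showing the shape `get_date` expects, assuming `load.config` is importable so the module loads:

from utils import Helper  # i.e. the module shown above

# Made-up fragment in the Google Docs API document shape (values invented).
doc = {'body': {'content': [
    {'sectionBreak': {}},  # index 0 is typically a section break
    {'paragraph': {'elements': [
        {'textRun': {'content': 'Заміни на 06.09.2023\n'}}]}},
]}}

print(Helper().get_date(doc))  # -> 'Заміни на 06.09.2023'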

View File

@@ -1,13 +1,50 @@
-#google-api-python-client
-#google-auth-httplib2
-#google-auth-oauthlib
-bs4
-requests
-GitPython
-lxml
-peewee
-aiogram
-cryptography
-pymysqldb
-psycopg2
-aioschedule
+aiogram==2.25.1
+aiohttp==3.8.4
+aioschedule==0.5.2
+aiosignal==1.3.1
+async-timeout==4.0.2
+attrs==22.2.0
+Babel==2.9.1
+beautifulsoup4==4.11.2
+bs4==0.0.1
+cachetools==5.3.1
+certifi==2022.12.7
+cffi==1.15.1
+charset-normalizer==3.0.1
+cryptography==39.0.1
+easydict==1.10
+frozenlist==1.3.3
+gitdb==4.0.10
+GitPython==3.1.30
+google-api-core==2.11.1
+google-api-python-client==2.97.0
+google-auth==2.22.0
+google-auth-httplib2==0.1.0
+google-auth-oauthlib==1.0.0
+googleapis-common-protos==1.60.0
+httplib2==0.22.0
+idna==3.4
+lxml==4.9.2
+magic-filter==1.0.9
+multidict==6.0.4
+oauthlib==3.2.2
+peewee==3.15.4
+protobuf==4.24.2
+psycopg2-binary==2.9.5
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pycparser==2.21
+PyMySQL==1.0.2
+PyMysqlDB==0.0.2
+pyparsing==3.1.1
+PySocks==1.7.1
+pytz==2022.7.1
+requests==2.31.0
+requests-oauthlib==1.3.1
+rsa==4.9
+six==1.16.0
+smmap==5.0.0
+soupsieve==2.3.2.post1
+uritemplate==4.1.1
+urllib3==1.26.14
+yarl==1.8.2
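One practical consequence of pinning every dependency is that a deployment can verify its environment against the pin file. A small sketch, not part of this commit, using only the standard library:

# Sketch: compare installed package versions against requirements.txt pins.
from importlib.metadata import version, PackageNotFoundError

with open('requirements.txt') as f:
    pins = [line.strip().split('==') for line in f if '==' in line]

for name, pinned in pins:
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f'{name} is not installed (pinned {pinned})')
        continue
    if installed != pinned:
        print(f'{name}: installed {installed}, pinned {pinned}')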

View File

@@ -0,0 +1,2 @@
from .parser import get_about_replacements, docs_parse
__all__ = ['get_about_replacements', 'docs_parse']

website-parser/parser.py (Normal file, 65 lines)
View File

@@ -0,0 +1,65 @@
import base64
import json
import datetime
from datetime import datetime as dt

import requests
from bs4 import BeautifulSoup

try:
    from load import config
except ImportError:
    config = None

try:
    from .utils import *
except ImportError:
    from utils import *

headers = {
    'user-agent': (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/62.0.3202.9 Safari/537.36"
    )
}


def date_parser_helper(days: int, parse: str = "%d.%m.20%y"):
    return dt.strftime(
        dt.now() +
        datetime.timedelta(days=days),
        parse
    )


def docs_parse():
    output = {
        "data": {},
        "another_teacher": None
    }
    page = requests.get(config.link, headers=headers)
    page.encoding = 'utf-8'
    soup = BeautifulSoup(page.text, "lxml")

    # Ideally this should be rewritten...
    url = image_parser(soup)
    with requests.get(url=url, allow_redirects=True, stream=True) as r:
        output['image'] = True
        output['date'] = 'невозможно получить!'  # "impossible to retrieve!"
        output['data']['all'] = base64.b64encode(r.content).decode('utf-8')

    with open(config.data_file, 'w') as f:
        json.dump(output, f, ensure_ascii=False)


def get_about_replacements() -> dict:
    with open(config.data_file, 'r') as f:
        data = json.loads(f.read())
    return data


docs_parse()
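Because this legacy parser stores the screenshot as a base64 string under `data['all']`, any consumer must decode it back to bytes before use. A minimal sketch, run from inside `website-parser/`; the output file name is invented:

import base64

from parser import get_about_replacements  # importing triggers docs_parse()

data = get_about_replacements()
if data.get('image'):
    # Decode the cached base64 payload back into raw image bytes.
    with open('replacements.png', 'wb') as img:
        img.write(base64.b64decode(data['data']['all']))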

website-parser/utils.py (Normal file, 34 lines)
View File

@@ -0,0 +1,34 @@
from bs4 import BeautifulSoup
from typing import Any


def table_parser(soup: BeautifulSoup, output):
    # Date parser
    date = (soup.find("main").findAll('span', style="color:black"))[1]
    output["date"] = date.text.replace(u'\xa0', u'')

    # Replacements parser
    replaces = soup.findAll('tr')
    for data in replaces:
        text = (
            data.find("td", valign="top")
            .find("span", style="color:black")
            .text.replace(u'\xa0', u'')
        )
        group = (
            data.find("span", style="color:black")
            .text.replace(" ", "").replace(u'\xa0', u''))
        output["data"][group] = text
    return output


def image_parser(soup: BeautifulSoup):
    image: Any
    extension = ('png', 'jpg')
    main = soup.find("main")
    for ext in extension:
        image = main.select(f'img[src$=".{ext}"]')
        if image:
            return image[0]['src']
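`image_parser` simply returns the `src` of the first image inside `<main>` whose extension matches. A quick illustration with invented markup:

from bs4 import BeautifulSoup

from utils import image_parser  # i.e. website-parser/utils.py

# Invented markup: the selector img[src$=".png"] matches the first image.
html = '<main><img src="/files/replaces.png"></main>'
print(image_parser(BeautifulSoup(html, 'lxml')))  # -> /files/replaces.png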