testing #15

Closed
tema wants to merge 3 commits from testing into master
8 changed files with 487 additions and 94 deletions

View File

@@ -67,6 +67,9 @@ def get_about_replacements() -> dict:
+    image, image_bytes = False, None  # make sure these exist on the table path
     try:
         count = document['body']["content"][element]["table"]["rows"]
     except (IndexError, KeyError):
+        image, image_bytes = helper.find_image(document)
+        if not image:
             element = helper.find_with_table(document)
             if element:
                 count = document['body']["content"][element]["table"]["rows"]
@@ -101,6 +104,13 @@ def get_about_replacements() -> dict:
             )
         )
+    if image:
+        return {
+            "image": image,
+            'date': date if not isinstance(date, bool) else "Error",
+            'data': {"all": image_bytes},
+            'another_teacher': another_teacher,
+        }
     return {
         'date': date if not isinstance(date, bool) else "Error",
         'data': dict(info),
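A consumer now has to handle two payload shapes. A minimal sketch of the branching (hypothetical caller, not part of this PR; assumes the module is importable as parser):

import base64

from parser import get_about_replacements

payload = get_about_replacements()
if payload.get("image"):
    # Image-only document: 'data' holds a single base64 blob under "all".
    with open("replaces.png", "wb") as out:  # hypothetical output path
        out.write(base64.b64decode(payload['data']['all']))
else:
    # Table/text document: 'data' maps group -> replacement text.
    for group, text in payload['data'].items():
        print(f"{group}: {text}")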

View File

@@ -1,6 +1,9 @@
 import os
 import datetime
 from datetime import datetime as dt
+import base64
+
+import requests
 
 from load import config
@@ -186,3 +189,25 @@ class Helper():
                 return doc
             element += 1
 
+    @classmethod
+    def get_link_and_download(cls, id_doc, document):
+        if "inlineObjects" in document:
+            if id_doc in document['inlineObjects']:
+                link = (document
+                        ['inlineObjects'][id_doc]['inlineObjectProperties']
+                        ['embeddedObject']['imageProperties']['contentUri'])
+                r = requests.get(link, stream=True)
+                return base64.b64encode(r.content).decode('utf-8')
+
+    @classmethod
+    def find_image(cls, document):
+        for i in document['body']["content"]:
+            if ("paragraph" in i) and ("elements" in i["paragraph"]):
+                # elements is a list; inspect its first entry
+                if "inlineObjectElement" in i["paragraph"]["elements"][0]:
+                    return True, cls.get_link_and_download(
+                        i["paragraph"]["elements"][0]
+                        ['inlineObjectElement']['inlineObjectId'], document)
+        return False, None
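For reference, a minimal sketch of the Docs API JSON these two helpers walk; the field names mirror the code above, while the object id and URI are made-up placeholders:

doc = {
    'body': {'content': [
        {'paragraph': {'elements': [
            {'inlineObjectElement': {'inlineObjectId': 'kix.abc123'}}
        ]}}
    ]},
    'inlineObjects': {
        'kix.abc123': {'inlineObjectProperties': {'embeddedObject': {
            'imageProperties': {'contentUri': 'https://example.com/image.png'}
        }}}
    },
}

obj_id = (doc['body']['content'][0]['paragraph']['elements'][0]
          ['inlineObjectElement']['inlineObjectId'])
uri = (doc['inlineObjects'][obj_id]['inlineObjectProperties']
       ['embeddedObject']['imageProperties']['contentUri'])
print(uri)  # get_link_and_download fetches this URI and base64-encodes the bytes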

View File

@@ -1,65 +1,117 @@
[The old requests/BeautifulSoup website parser is deleted from this file; it is preserved verbatim as website-parser/parser.py below. The new contents:]

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import json

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials

from load import config
from .utils import Helper

# If modifying these scopes, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']

__all__ = ['docs_parse', 'get_about_replacements']


def docs_parse() -> None:
    creds = None
    # The file token.json stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the
    # first time.
    if os.path.exists(config.token_file):
        creds = Credentials.from_authorized_user_file(
            config.token_file,
            SCOPES
        )
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                config.credentials_file, SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open(config.token_file, 'w') as token:
            token.write(creds.to_json())

    service = build('docs', 'v1', credentials=creds)

    # Retrieve the document's contents from the Docs service.
    document = service.documents().get(documentId=config.documentid).execute()

    if os.path.exists(config.data_file):
        os.remove(config.data_file)

    with open(config.data_file, 'w') as f:
        json.dump(document, f, ensure_ascii=False)


def read_parse_data():
    with open(config.data_file, 'r') as f:
        data = json.loads(f.read())
    return data


def get_about_replacements() -> dict:
    helper = Helper()
    document = read_parse_data()
    info = []
    element = helper.get_table_element()
    image, image_bytes = False, None  # make sure these exist on the table path
    try:
        count = document['body']["content"][element]["table"]["rows"]
    except (IndexError, KeyError):
        image, image_bytes = helper.find_image(document)
        if not image:
            element = helper.find_with_table(document)
            if element:
                count = document['body']["content"][element]["table"]["rows"]
            else:
                info = helper.find_with_text(document)
    date = helper.get_date(document)
    another_teacher = helper.teacher(document)
    if element and not image:  # image-only documents have no table to walk
        for c in range(0, count):
            more_replaces = (document['body']
                             ["content"][element]["table"]
                             ["tableRows"][c]["tableCells"][1]
                             ["content"])
            replaces = ''
            for i in range(0, len(more_replaces)):
                replaces += (document['body']["content"][element]["table"]
                             ["tableRows"][c]["tableCells"][1]
                             ["content"][i]["paragraph"]["elements"][0]
                             ["textRun"]["content"].rstrip("\n"))
            info.append(
                (
                    document['body']["content"][element]["table"]
                    ["tableRows"][c]["tableCells"][0]
                    ["content"][0]["paragraph"]["elements"][0]
                    ["textRun"]["content"].rstrip("\n"),
                    replaces
                )
            )
    if image:
        return {
            "image": image,
            'date': date if not isinstance(date, bool) else "Error",
            'data': {"all": image_bytes},
            'another_teacher': another_teacher,
        }
    return {
        'date': date if not isinstance(date, bool) else "Error",
        'data': dict(info),
        'another_teacher': another_teacher,
    }
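End-to-end usage is then a two-step call; a sketch assuming load.config provides token_file, credentials_file, documentid and data_file (all referenced above), and that the module keeps its parser.py name:

from parser import docs_parse, get_about_replacements

docs_parse()                         # OAuth flow on first run, then reuses token.json
replacements = get_about_replacements()
print(replacements['date'])          # date string, or "Error"
print(replacements['data'])          # {group: text} or {"all": <base64 image>}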

View File

@@ -1,34 +1,203 @@
[The old BeautifulSoup helpers are deleted from this file; they are preserved verbatim as website-parser/utils.py below. The new contents:]

import os
import datetime
from datetime import datetime as dt
import base64

import requests

from load import config


def date_parser_helper(days: int, parse: str = "%d.%m.20%y"):
    return dt.strftime(
        dt.now() +
        datetime.timedelta(days=days),
        parse
    )


'''
self.months = {
    1: "січень",
    2: "лютий",
    3: "березень",
    4: "квітень",
    5: "травень",
    6: "червень",
    7: "липень",
    8: "серпень",
    9: "вересень",
    10: "жовтень",
    11: "листопад",
    12: "грудень"
}
'''


class Helper():
    def __init__(self):
        self.date_now = date_parser_helper(0)
        self.date_next = date_parser_helper(1)
        self.weekend_pass = date_parser_helper(2)
        self.two_day_pass = date_parser_helper(3)
        self.black_list = [
            'черговий викладач',
            self.date_now,
            self.date_next,
            self.weekend_pass,
            self.two_day_pass
        ]

    @staticmethod
    def find_with_table(document):
        c_element = 2
        while True:
            try:
                document['body']["content"][c_element]["table"]["rows"]
                break
            except KeyError:
                c_element += 1
                if c_element > 15:
                    return False
            except IndexError:
                return False
        with open("{}/table_element.txt".format(config.config_folder), 'w') as f:
            f.write(str(c_element))
        return c_element

    def find_with_text(self, document):
        format_charset = '-'
        alternative_format_charset = "\t"
        element = 4
        data = []
        text = ''
        while element < 15:
            doc = (
                document['body']["content"][element]
                ["paragraph"]["elements"][0]["textRun"]["content"]
            ).rstrip("\n").replace("–", "-", 1)  # normalize an en dash to a hyphen
            if (
                (
                    ("-" in doc)
                    #and
                    #("\t" not in doc)
                )
                and
                # skip lines containing any blacklisted phrase (dates, duty teacher)
                all(p not in doc.lower() for p in self.black_list)
            ):
                try:
                    group, text = doc.split(format_charset)
                except ValueError:
                    if element > 6:
                        break
                    else:
                        try:
                            group, text = doc.split(alternative_format_charset)
                        except ValueError:
                            if element > 6:
                                break
                if text != '':
                    data.append(
                        (group.strip(" "), text.lstrip(" ").replace("\t", ""))
                    )
            element += 1
        return data

    def get_date(self, document):
        date_element = 1
        while date_element < 16:
            try:
                date = (
                    document['body']["content"][date_element]
                    ["paragraph"]["elements"][0]["textRun"]["content"]
                    .rstrip(" \n"))
            except (IndexError, KeyError):
                date_element += 1
                continue  # nothing readable here, try the next element
            cleaned = (date.lower()
                       .lstrip("заміни").lstrip("на").replace(" ", ""))
            if (
                self.date_now in cleaned
                or self.date_next in cleaned
                or self.weekend_pass in cleaned
                or self.two_day_pass in cleaned
                or "заміни на" in date.lower()
            ):
                return date
            date_element += 1
        return False

    @staticmethod
    def get_table_element():
        if os.path.exists(f"{config.config_folder}/table_element.txt"):
            with open(f"{config.config_folder}/table_element.txt", 'r') as f:
                element = int(f.read())
        else:
            element = 6
        return element

    @staticmethod
    def teacher(document):
        element = 1
        while element < 6:
            if "paragraph" in document['body']["content"][element]:
                elements = (document['body']["content"][element]
                            ["paragraph"]["elements"])
                for p in range(len(elements)):
                    doc = elements[p]["textRun"]["content"].rstrip("\n")
                    if 'черговий викладач' in doc.lower():
                        return doc
            element += 1

    @classmethod
    def find_image(cls, document):
        for i in document['body']["content"]:
            if ("paragraph" in i) and ("elements" in i["paragraph"]):
                if "inlineObjectElement" in i["paragraph"]["elements"][0]:
                    # NOTE: encodes a local photo.png instead of downloading
                    # the embedded image from the document
                    return True, base64.b64encode(
                        open("photo.png", 'rb').read()).decode('utf-8')
        return False, None
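date_parser_helper() produces the dd.mm.20yy strings that seed black_list, which is how find_with_text skips header lines that only carry a date. Illustrative values, assuming "today" is 01.09.2023:

date_parser_helper(0)  # -> '01.09.2023'  (self.date_now)
date_parser_helper(1)  # -> '02.09.2023'  (self.date_next)
date_parser_helper(2)  # -> '03.09.2023'  (self.weekend_pass)
date_parser_helper(3)  # -> '04.09.2023'  (self.two_day_pass)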

View File

@@ -1,13 +1,50 @@
-#google-api-python-client
-#google-auth-httplib2
-#google-auth-oauthlib
-bs4
-requests
-GitPython
-lxml
-peewee
-aiogram
-cryptography
-pymysqldb
-psycopg2
-aioschedule
+aiogram==2.25.1
+aiohttp==3.8.4
+aioschedule==0.5.2
+aiosignal==1.3.1
+async-timeout==4.0.2
+attrs==22.2.0
+Babel==2.9.1
+beautifulsoup4==4.11.2
+bs4==0.0.1
+cachetools==5.3.1
+certifi==2022.12.7
+cffi==1.15.1
+charset-normalizer==3.0.1
+cryptography==39.0.1
+easydict==1.10
+frozenlist==1.3.3
+gitdb==4.0.10
+GitPython==3.1.30
+google-api-core==2.11.1
+google-api-python-client==2.97.0
+google-auth==2.22.0
+google-auth-httplib2==0.1.0
+google-auth-oauthlib==1.0.0
+googleapis-common-protos==1.60.0
+httplib2==0.22.0
+idna==3.4
+lxml==4.9.2
+magic-filter==1.0.9
+multidict==6.0.4
+oauthlib==3.2.2
+peewee==3.15.4
+protobuf==4.24.2
+psycopg2-binary==2.9.5
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pycparser==2.21
+PyMySQL==1.0.2
+PyMysqlDB==0.0.2
+pyparsing==3.1.1
+PySocks==1.7.1
+pytz==2022.7.1
+requests==2.31.0
+requests-oauthlib==1.3.1
+rsa==4.9
+six==1.16.0
+smmap==5.0.0
+soupsieve==2.3.2.post1
+uritemplate==4.1.1
+urllib3==1.26.14
+yarl==1.8.2

View File

@@ -0,0 +1,2 @@
+from .parser import get_about_replacements, docs_parse
+__all__ = ['get_about_replacements', 'docs_parse']
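With this re-export the two entry points can be imported from the package itself; a one-line sketch, with docs_parser standing in for the package directory name (not shown in the diff):

from docs_parser import docs_parse, get_about_replacements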

website-parser/parser.py Normal file
View File

@@ -0,0 +1,65 @@
import base64
import json
import datetime
from datetime import datetime as dt

import requests
from bs4 import BeautifulSoup

try:
    from load import config
except ImportError:
    config = None

try:
    from .utils import *
except ImportError:
    from utils import *

headers = {
    'user-agent': (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/62.0.3202.9 Safari/537.36"
    )
}


def date_parser_helper(days: int, parse: str = "%d.%m.20%y"):
    return dt.strftime(
        dt.now() +
        datetime.timedelta(days=days),
        parse
    )


def docs_parse():
    output = {
        "data": {},
        "another_teacher": None
    }
    page = requests.get(config.link, headers=headers)
    page.encoding = 'utf-8'
    soup = BeautifulSoup(page.text, "lxml")

    # Ideally this should be rewritten...
    url = image_parser(soup)
    with requests.get(url=url, allow_redirects=True, stream=True) as r:
        output['image'] = True
        output['date'] = 'невозможно получить!'
        output['data']['all'] = base64.b64encode(r.content).decode('utf-8')

    with open(config.data_file, 'w') as f:
        json.dump(output, f, ensure_ascii=False)


def get_about_replacements() -> dict:
    with open(config.data_file, 'r') as f:
        data = json.loads(f.read())
    return data


docs_parse()  # refresh the cache whenever this module is imported or run
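For reference, the JSON this module caches in config.data_file, which its get_about_replacements() returns verbatim (keys taken from docs_parse above, values illustrative; the date value is the module's literal Russian string for "cannot be retrieved!"):

{
    "data": {"all": "<base64-encoded PNG/JPG bytes>"},
    "another_teacher": null,
    "image": true,
    "date": "невозможно получить!"
}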

website-parser/utils.py Normal file
View File

@@ -0,0 +1,34 @@
from bs4 import BeautifulSoup
from typing import Any


def table_parser(soup: BeautifulSoup, output):
    # Date parser
    date = (soup.find("main").findAll('span', style="color:black"))[1]
    output["date"] = date.text.replace(u'\xa0', u'')
    # Replacements parser
    replaces = soup.findAll('tr')
    for data in replaces:
        text = (
            data.find("td", valign="top")
            .find("span", style="color:black")
            .text.replace(u'\xa0', u'')
        )
        group = (
            data.find("span", style="color:black")
            .text.replace(" ", "").replace(u'\xa0', u''))
        output["data"][group] = text
    return output


def image_parser(soup: BeautifulSoup):
    image: Any
    extension = ('png', 'jpg')
    main = soup.find("main")
    for ext in extension:
        image = main.select(f'img[src$=".{ext}"]')
        if image:
            return image[0]['src']
    # falls through and returns None when no matching image is found
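image_parser() leans on the CSS "attribute ends-with" selector; a self-contained check with made-up markup:

from bs4 import BeautifulSoup

html = '<main><img src="/files/replaces.png"></main>'  # hypothetical markup
soup = BeautifulSoup(html, "lxml")
# img[src$=".png"] matches <img> tags whose src attribute ends with ".png"
print(soup.find("main").select('img[src$=".png"]')[0]['src'])  # /files/replaces.png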