G parser
This commit is contained in:
parent
437ed25e84
commit
26b5028359
117
1.p
Normal file
117
1.p
Normal file
@ -0,0 +1,117 @@
|
|||||||
|
diff --git a/engineering_works.py b/engineering_works.py
|
||||||
|
index a122c3c..ebd6cea 100644
|
||||||
|
--- a/engineering_works.py
|
||||||
|
+++ b/engineering_works.py
|
||||||
|
@@ -13,11 +13,11 @@ logging.basicConfig(
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
-WEBAPP_HOST = config.bot("ip")
|
||||||
|
-WEBAPP_PORT = config.bot("port")
|
||||||
|
+WEBAPP_HOST = config.ip
|
||||||
|
+WEBAPP_PORT = config.port
|
||||||
|
|
||||||
|
WEBHOOK_HOST = f'http://{WEBAPP_HOST}:{WEBAPP_PORT}'
|
||||||
|
-WEBHOOK_PATH = f'/bot{config.bot("token")}/'
|
||||||
|
+WEBHOOK_PATH = f'/bot{config.token}/'
|
||||||
|
WEBHOOK_URL = f"{WEBHOOK_HOST}{WEBHOOK_PATH}"
|
||||||
|
|
||||||
|
engeneerings_works = (
|
||||||
|
@@ -29,7 +29,7 @@ parse_error = (
|
||||||
|
"Бот приостановлен на неопределенный срок!\n"
|
||||||
|
"Что случилось?\n"
|
||||||
|
"Администрация коледжа изменила формат файла с google docs на docx(Microsoft Office)\n"
|
||||||
|
- "Замены вы можете посмотреть тут: https://docs.google.com/document/d/{}".format(config.documentid)
|
||||||
|
+ "Замены вы можете посмотреть тут: https://docs.google.com/document/d/{}".format("")
|
||||||
|
)
|
||||||
|
|
||||||
|
new_year = (
|
||||||
|
@@ -48,7 +48,11 @@ september_1 = ("Всіх з 1 вересням, всього найкращог
|
||||||
|
"Бот буде запущений чуть пізніше, "
|
||||||
|
"коли заміни будуть публіковаться текстом")
|
||||||
|
|
||||||
|
-send_msg = the_end
|
||||||
|
+upd_1 = ("Невеликі зміни в боті.\n"
|
||||||
|
+ "1. Добавлени донати, тепер ви можете підтримати автора бота\n"
|
||||||
|
+ "2. Добалено звязок з адміністратором")
|
||||||
|
+
|
||||||
|
+send_msg = upd_1
|
||||||
|
|
||||||
|
async def on_startup(dp):
|
||||||
|
await bot.set_webhook(url=WEBHOOK_URL)
|
||||||
|
@@ -81,7 +85,7 @@ async def start(message: types.Message):
|
||||||
|
)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
- if config.bot("use_webhook").lower() in ['t', 'true', '1', 'yes', 'y']:
|
||||||
|
+ if config.use_webhook.lower() in ['t', 'true', '1', 'yes', 'y']:
|
||||||
|
executor.start_webhook(
|
||||||
|
dispatcher=dp,
|
||||||
|
webhook_path=WEBHOOK_PATH,
|
||||||
|
diff --git a/filters/main.py b/filters/main.py
|
||||||
|
index 849e6de..7539de0 100644
|
||||||
|
--- a/filters/main.py
|
||||||
|
+++ b/filters/main.py
|
||||||
|
@@ -28,7 +28,7 @@ class BotAdmin(BoundFilter):
|
||||||
|
self.admin = admin
|
||||||
|
|
||||||
|
async def check(self, message: types.Message):
|
||||||
|
- if message.from_user.id in config.admin_user:
|
||||||
|
+ if message.from_user.id in [int(i) for i in config.admin_users.split(",")]:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
await message.answer("Хорошая попытка, но ты не администратор!")
|
||||||
|
diff --git a/handlers/private/main.py b/handlers/private/main.py
|
||||||
|
index de03596..370218a 100644
|
||||||
|
--- a/handlers/private/main.py
|
||||||
|
+++ b/handlers/private/main.py
|
||||||
|
@@ -45,7 +45,7 @@ async def get_replace(message: types.Message, state: FSMContext):
|
||||||
|
|
||||||
|
link = (
|
||||||
|
'<a href="{}">Проверьте замены тут</a>'
|
||||||
|
- .format(config.bot("link"))
|
||||||
|
+ .format(config.link)
|
||||||
|
)
|
||||||
|
logging.info("User: {user_id} - {username}".format(
|
||||||
|
user_id=str(message.from_user.id),
|
||||||
|
@@ -94,7 +94,7 @@ async def get_replace(message: types.Message, state: FSMContext):
|
||||||
|
async def get_link(message: types.Message):
|
||||||
|
msg = (
|
||||||
|
'<a href="{}">Проверьте замены тут</a>'
|
||||||
|
- .format(config.bot("link"))
|
||||||
|
+ .format(config.link)
|
||||||
|
)
|
||||||
|
await bot.send_message(
|
||||||
|
message.chat.id,
|
||||||
|
diff --git a/utils/announcements.py b/utils/announcements.py
|
||||||
|
index f70541e..4e264de 100644
|
||||||
|
--- a/utils/announcements.py
|
||||||
|
+++ b/utils/announcements.py
|
||||||
|
@@ -1,3 +1,4 @@
|
||||||
|
+
|
||||||
|
import datetime
|
||||||
|
import asyncio
|
||||||
|
import aioschedule as schedule
|
||||||
|
@@ -15,8 +16,8 @@ async def announce():
|
||||||
|
except Exception:
|
||||||
|
message = "Ошибка обновления данных!"
|
||||||
|
if config.admin_users.split(',') is not None:
|
||||||
|
- for user_id in config.admin_users.split(','):
|
||||||
|
- if user_id in config.exclude:
|
||||||
|
+ for user_id in [int(i) for i in config.admin_users.split(',')]:
|
||||||
|
+ if user_id in [int(i) for i in config.exclude.split(",")]:
|
||||||
|
continue
|
||||||
|
await dp.bot.send_message(user_id, message)
|
||||||
|
|
||||||
|
diff --git a/utils/bot_commands.py b/utils/bot_commands.py
|
||||||
|
index b65bc62..3de6ed1 100644
|
||||||
|
--- a/utils/bot_commands.py
|
||||||
|
+++ b/utils/bot_commands.py
|
||||||
|
@@ -7,6 +7,6 @@ async def set_commands(dp):
|
||||||
|
types.BotCommand("help", "информация"),
|
||||||
|
types.BotCommand("link", "получить ссылку на файл"),
|
||||||
|
types.BotCommand('timetable', "Розклад"),
|
||||||
|
- types.BotCommand('feedback', "Звязок з адміністратором")
|
||||||
|
+ types.BotCommand('feedback', "Звязок з адміністратором"),
|
||||||
|
types.BotCommand("reload", "только для администрации"),
|
||||||
|
])
|
@ -5,5 +5,5 @@ peewee
|
|||||||
aiogram
|
aiogram
|
||||||
cryptography
|
cryptography
|
||||||
pymysqldb
|
pymysqldb
|
||||||
psycopg2
|
#psycopg2
|
||||||
aioschedule
|
aioschedule
|
||||||
|
@ -18,11 +18,17 @@ class Configure:
|
|||||||
for key, value in config.items(section):
|
for key, value in config.items(section):
|
||||||
self.data[section][key] = value
|
self.data[section][key] = value
|
||||||
|
|
||||||
|
config_folder = config.get("Docs_Settings", "Config_folder").rstrip("/")
|
||||||
|
self.data["documentid"] = config.get("Docs_Settings", 'Document_ID')
|
||||||
|
self.data["data_file"] = config_folder + "/" + config.get("Docs_Settings", "data_file")
|
||||||
|
self.data["credentials_file"] = config_folder + "/" + config.get("Docs_Settings", "credentials_file")
|
||||||
|
self.data["token_file"] = self.config_folder + "/" + self.data['Docs_Settings']['token_file']
|
||||||
|
|
||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
|
if name in ["documentid", "data_file", "credentials_file", "token_file"]:
|
||||||
|
return self.data[name]
|
||||||
for key in self.data.keys():
|
for key in self.data.keys():
|
||||||
if name not in self.data[key]:
|
if name not in self.data[key]:
|
||||||
continue
|
continue
|
||||||
return self.data[key][name]
|
return self.data[key][name]
|
||||||
raise NameError("Config options not found!")
|
# raise NameError("Config options not found!")
|
||||||
|
@ -1,11 +1,12 @@
|
|||||||
{
|
{
|
||||||
"1, 121, 12c": "1.jpg",
|
"1, 121, 12c": "1.jpg",
|
||||||
"131, 13c, 141, 14c": "2.jpg",
|
"131, 13c, 141, 14c": "2.jpg",
|
||||||
"3, 411, 42c, 431": "3.jpg",
|
"3, 421, 42c, 431": "3.jpg",
|
||||||
"43c": "4.jpg",
|
"43c": "4.jpg",
|
||||||
"4, 521, 52c, 531": "5.jpg",
|
"4, 521, 52c, 531": "5.jpg",
|
||||||
"53c, 541, 54c": "6.jpg",
|
"53c": "6.jpg",
|
||||||
"2, 221, 22c, 231": "7.jpg",
|
"541, 54c": "7.jpg",
|
||||||
"23c, 241, 24c": "8.jpg",
|
"2, 221, 22c, 231": "8.jpg",
|
||||||
"411, 421, 431": ["9.jpg","10.jpg"]
|
"23c, 241, 24c": "9.jpg",
|
||||||
|
"411, 421, 431": ["10.jpg","11.jpg"]
|
||||||
}
|
}
|
||||||
|
@ -13,11 +13,11 @@ logging.basicConfig(
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
WEBAPP_HOST = config.bot("ip")
|
WEBAPP_HOST = config.ip
|
||||||
WEBAPP_PORT = config.bot("port")
|
WEBAPP_PORT = config.port
|
||||||
|
|
||||||
WEBHOOK_HOST = f'http://{WEBAPP_HOST}:{WEBAPP_PORT}'
|
WEBHOOK_HOST = f'http://{WEBAPP_HOST}:{WEBAPP_PORT}'
|
||||||
WEBHOOK_PATH = f'/bot{config.bot("token")}/'
|
WEBHOOK_PATH = f'/bot{config.token}/'
|
||||||
WEBHOOK_URL = f"{WEBHOOK_HOST}{WEBHOOK_PATH}"
|
WEBHOOK_URL = f"{WEBHOOK_HOST}{WEBHOOK_PATH}"
|
||||||
|
|
||||||
engeneerings_works = (
|
engeneerings_works = (
|
||||||
@ -29,7 +29,7 @@ parse_error = (
|
|||||||
"Бот приостановлен на неопределенный срок!\n"
|
"Бот приостановлен на неопределенный срок!\n"
|
||||||
"Что случилось?\n"
|
"Что случилось?\n"
|
||||||
"Администрация коледжа изменила формат файла с google docs на docx(Microsoft Office)\n"
|
"Администрация коледжа изменила формат файла с google docs на docx(Microsoft Office)\n"
|
||||||
"Замены вы можете посмотреть тут: https://docs.google.com/document/d/{}".format(config.documentid)
|
# "Замены вы можете посмотреть тут: https://docs.google.com/document/d/{}".format(config.documentid)
|
||||||
)
|
)
|
||||||
|
|
||||||
new_year = (
|
new_year = (
|
||||||
@ -45,10 +45,20 @@ the_end =(
|
|||||||
)
|
)
|
||||||
|
|
||||||
september_1 = ("Всіх з 1 вересням, всього найкращого!\n"
|
september_1 = ("Всіх з 1 вересням, всього найкращого!\n"
|
||||||
"Бот буде запущений чуть пізніше, "
|
"Бот працює в нормальному режимі!\n"
|
||||||
"коли заміни будуть публіковаться текстом")
|
"Приятного використання!")
|
||||||
|
|
||||||
send_msg = the_end
|
send_msg = the_end
|
||||||
|
idea = (
|
||||||
|
"Бажаєте предложити ідеї для функціонала бота, або для новошо боту?\n"
|
||||||
|
"У вас є така можливість, відправляйте свої ідеї в /feedback")
|
||||||
|
msg = idea
|
||||||
|
|
||||||
|
donate_add = ("На період канікул бот був вимкнутий\n"
|
||||||
|
"Ви можете зробити донат\n"
|
||||||
|
f"Оплптити онлайн: {config.payment_link}\n"
|
||||||
|
f"Переказ на карту: {config.card_number}\n")
|
||||||
|
msg = september_1
|
||||||
|
|
||||||
async def on_startup(dp):
|
async def on_startup(dp):
|
||||||
await bot.set_webhook(url=WEBHOOK_URL)
|
await bot.set_webhook(url=WEBHOOK_URL)
|
||||||
@ -63,7 +73,7 @@ async def asd(message):
|
|||||||
if user_id != 1083440854:
|
if user_id != 1083440854:
|
||||||
print(user_id)
|
print(user_id)
|
||||||
try:
|
try:
|
||||||
await bot.send_message(chat_id=user_id, text=send_msg)
|
await bot.send_message(chat_id=user_id, text=msg)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -77,11 +87,11 @@ async def start(message: types.Message):
|
|||||||
)
|
)
|
||||||
await bot.send_message(
|
await bot.send_message(
|
||||||
message.chat.id,
|
message.chat.id,
|
||||||
engeneerings_works
|
msg
|
||||||
)
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if config.bot("use_webhook").lower() in ['t', 'true', '1', 'yes', 'y']:
|
if config.use_webhook.lower() in ['t', 'true', '1', 'yes', 'y']:
|
||||||
executor.start_webhook(
|
executor.start_webhook(
|
||||||
dispatcher=dp,
|
dispatcher=dp,
|
||||||
webhook_path=WEBHOOK_PATH,
|
webhook_path=WEBHOOK_PATH,
|
||||||
|
@ -14,7 +14,7 @@ from database import register
|
|||||||
|
|
||||||
@dp.message_handler(ChatTypeFilter(['group', 'supergroup']), commands=['set'])
|
@dp.message_handler(ChatTypeFilter(['group', 'supergroup']), commands=['set'])
|
||||||
async def set_group(message: types.Message):
|
async def set_group(message: types.Message):
|
||||||
if (message.from_user.id not in [admin.user.id for admin in await bot.get_chat_administrators(message.chat.id)]) and (message.from_user.id not in config.admin_user):
|
if (message.from_user.id not in [admin.user.id for admin in await bot.get_chat_administrators(message.chat.id)]) and (str(message.from_user.id) not in config.admin_users.split(",")):
|
||||||
await message.answer("Вы не являетесь администратором чата!")
|
await message.answer("Вы не являетесь администратором чата!")
|
||||||
return
|
return
|
||||||
args = message.text.split()
|
args = message.text.split()
|
||||||
|
@ -14,10 +14,11 @@ async def feedback(message: types.Message, state):
|
|||||||
async def send_admins(message: types.Message, state):
|
async def send_admins(message: types.Message, state):
|
||||||
await message.copy_to(config.chat_id, reply_markup=await answer_kb(message.from_user.id))
|
await message.copy_to(config.chat_id, reply_markup=await answer_kb(message.from_user.id))
|
||||||
await message.answer("Дякую!")
|
await message.answer("Дякую!")
|
||||||
|
await state.finish()
|
||||||
|
|
||||||
|
|
||||||
@dp.message_handler(state="answer_support")
|
@dp.message_handler(state="answer_support")
|
||||||
async def send_answer(message: types.Message, state):
|
async def send_answer(message: types.Message, state):
|
||||||
data = await state.get_data()
|
data = await state.get_data()
|
||||||
await message.copy_to(data["u"])
|
|
||||||
await state.finish()
|
await state.finish()
|
||||||
|
await message.copy_to(data["u"])
|
||||||
|
150
parser/parser.py
150
parser/parser.py
@ -1,65 +1,121 @@
|
|||||||
import base64
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
import json
|
import json
|
||||||
import datetime
|
|
||||||
from datetime import datetime as dt
|
|
||||||
|
|
||||||
import requests
|
from googleapiclient.discovery import build
|
||||||
from bs4 import BeautifulSoup
|
from google_auth_oauthlib.flow import InstalledAppFlow
|
||||||
|
from google.auth.transport.requests import Request
|
||||||
|
from google.oauth2.credentials import Credentials
|
||||||
|
|
||||||
try:
|
|
||||||
from load import config
|
from load import config
|
||||||
except ImportError: config = None
|
from .utils import Helper
|
||||||
try:
|
|
||||||
from .utils import *
|
# If modifying these scopes, delete the file token.json.
|
||||||
except ImportError:
|
SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
|
||||||
from utils import *
|
|
||||||
|
__all__ = ['docs_parse', 'get_about_replacements']
|
||||||
|
|
||||||
|
|
||||||
headers = {
|
def docs_parse() -> None:
|
||||||
'user-agent':(
|
creds = None
|
||||||
"Mozilla/5.0 (Windows NT 10.0; WOW64) "
|
# The file token.json stores the user's access and refresh tokens, and is
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
# created automatically when the authorization flow completes for the first
|
||||||
"Chrome/62.0.3202.9 Safari/537.36"
|
# time.
|
||||||
|
if os.path.exists(config.token_file):
|
||||||
|
creds = Credentials.from_authorized_user_file(
|
||||||
|
config.token_file,
|
||||||
|
SCOPES
|
||||||
)
|
)
|
||||||
}
|
# If there are no (valid) credentials available, let the user log in.
|
||||||
|
if not creds or not creds.valid:
|
||||||
|
if creds and creds.expired and creds.refresh_token:
|
||||||
|
creds.refresh(Request())
|
||||||
|
else:
|
||||||
|
flow = InstalledAppFlow.from_client_secrets_file(
|
||||||
|
config.credentials_file, SCOPES)
|
||||||
|
creds = flow.run_local_server(port=0)
|
||||||
|
# Save the credentials for the next run
|
||||||
|
with open(config.token_file, 'w') as token:
|
||||||
|
token.write(creds.to_json())
|
||||||
|
|
||||||
|
service = build('docs', 'v1', credentials=creds)
|
||||||
|
|
||||||
def date_parser_helper(days:int, parse:str="%d.%m.20%y"):
|
# Retrieve the documents contents from the Docs service.
|
||||||
return dt.strftime(
|
document = service.documents().get(documentId=config.documentid).execute()
|
||||||
dt.now() +
|
if os.path.exists(config.data_file):
|
||||||
datetime.timedelta(days=days),
|
os.remove(config.data_file)
|
||||||
parse
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def docs_parse():
|
|
||||||
|
|
||||||
output = {
|
|
||||||
"data":{},
|
|
||||||
"another_teacher":None
|
|
||||||
}
|
|
||||||
|
|
||||||
page = requests.get(config.link, headers=headers)
|
|
||||||
page.encoding = 'utf-8'
|
|
||||||
|
|
||||||
soup = BeautifulSoup(page.text, "lxml")
|
|
||||||
|
|
||||||
# Это в идеале нужно переписать...
|
|
||||||
url = image_parser(soup)
|
|
||||||
with requests.get(url=url, allow_redirects=True, stream=True) as r:
|
|
||||||
output['image'] = True
|
|
||||||
output['date'] = 'невозможно получить!'
|
|
||||||
output['data']['all'] = base64.b64encode(r.content).decode('utf-8')
|
|
||||||
|
|
||||||
|
|
||||||
with open(config.data_file, 'w') as f:
|
with open(config.data_file, 'w') as f:
|
||||||
json.dump(output, f, ensure_ascii=False)
|
json.dump(document, f, ensure_ascii=False)
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
def get_about_replacements() -> dict:
|
def read_parse_data():
|
||||||
with open(config.data_file, 'r') as f:
|
with open(config.data_file, 'r') as f:
|
||||||
data = json.loads(f.read())
|
data = json.loads(f.read())
|
||||||
f.close()
|
f.close()
|
||||||
return data
|
return data
|
||||||
docs_parse()
|
|
||||||
|
|
||||||
|
def get_about_replacements() -> dict:
    """Build the replacements payload from the stored Google Docs document.

    The document is read with ``read_parse_data()`` and inspected in this
    order:

    1. the table at the cached element index (``Helper.get_table_element``);
    2. if that index is not a table: an inline image (``Helper.find_image``);
    3. if there is no image: a table found by scanning
       (``Helper.find_with_table``);
    4. if there is no table either: plain text lines
       (``Helper.find_with_text``).

    Returns:
        A dict with the keys ``date``, ``data`` and ``another_teacher``.
        When the document holds an image, ``image`` is added, ``data`` is
        ``{"all": <base64 image>}`` and ``date`` is ``"Error"``.
    """
    helper = Helper()
    document = read_parse_data()
    info = []
    # These names used to be bound only inside the ``except`` branch, so the
    # normal path (cached index still points at the table) failed on an
    # unbound ``image``. Give them defaults up front.
    image, image_bytes = False, None
    count = 0
    element = helper.get_table_element()

    try:
        count = document['body']["content"][element]["table"]["rows"]
    except (IndexError, KeyError):
        image, image_bytes = helper.find_image(document)
        if not image:
            element = helper.find_with_table(document)
            if element:
                count = document['body']["content"][element]["table"]["rows"]
            else:
                info = helper.find_with_text(document)

    if image:
        # No date or teacher is extracted for image documents.
        return {
            "image": image,
            'date': "Error",
            'data': {"all": image_bytes},
            'another_teacher': None,
        }

    date = helper.get_date(document)
    another_teacher = helper.teacher(document)

    if element:
        table_rows = document['body']["content"][element]["table"]["tableRows"]
        for row in table_rows[:count]:
            cells = row["tableCells"]
            # Column 0 holds the group name (first text run of the cell).
            group = (
                cells[0]["content"][0]["paragraph"]["elements"][0]
                ["textRun"]["content"].rstrip("\n")
            )
            # Column 1 may span several paragraphs; concatenate the first
            # text run of each one.
            replaces = "".join(
                part["paragraph"]["elements"][0]["textRun"]["content"]
                .rstrip("\n")
                for part in cells[1]["content"]
            )
            info.append((group, replaces))

    return {
        # get_date() returns False when no date line was found.
        'date': "Error" if isinstance(date, bool) else date,
        'data': dict(info),
        'another_teacher': another_teacher,
    }
|
||||||
|
230
parser/utils.py
230
parser/utils.py
@ -1,34 +1,210 @@
|
|||||||
from bs4 import BeautifulSoup
|
import os
|
||||||
from typing import Any
|
import datetime
|
||||||
|
from datetime import datetime as dt
|
||||||
|
|
||||||
def table_parser(soup: BeautifulSoup, output):
|
import requests
|
||||||
#Date parser
|
|
||||||
date = (soup.find("main").findAll('span', style="color:black"))[1]
|
from load import config
|
||||||
output["date"] = date.text.replace(u'\xa0', u'')
|
|
||||||
|
|
||||||
|
|
||||||
#Replaces parser
|
def date_parser_helper(days:int, parse:str="%d.%m.20%y"):
|
||||||
replaces = soup.findAll('tr')
|
return dt.strftime(
|
||||||
for data in replaces:
|
dt.now() +
|
||||||
|
datetime.timedelta(days=days),
|
||||||
text = (
|
parse
|
||||||
data.find("td", valign="top")
|
|
||||||
.find("span", style="color:black")
|
|
||||||
.text.replace(u'\xa0', u'')
|
|
||||||
)
|
)
|
||||||
group = (
|
|
||||||
data.find("span", style="color:black")
|
|
||||||
.text.replace(" ", "").replace(u'\xa0', u''))
|
|
||||||
output["data"][group] = text
|
|
||||||
|
|
||||||
return output
|
'''
|
||||||
|
self.months = {
|
||||||
|
1: "січень",
|
||||||
|
2: "лютий",
|
||||||
|
3: "березень",
|
||||||
|
4: "квітень",
|
||||||
|
5: "травень",
|
||||||
|
6: "червень",
|
||||||
|
7: "липень",
|
||||||
|
8: "серпень",
|
||||||
|
9: "вересень",
|
||||||
|
10: "жовтень",
|
||||||
|
11: "листопад",
|
||||||
|
12: "грудень"
|
||||||
|
}
|
||||||
|
'''
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
'user-agent':(
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; WOW64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/62.0.3202.9 Safari/537.36"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
class Helper():
|
||||||
|
|
||||||
|
def __init__(self):
    """Precompute the date strings for today and the next three days.

    The strings come from ``date_parser_helper`` with its default format.
    ``black_list`` collects them together with the duty-teacher marker.
    """
    (
        self.date_now,
        self.date_next,
        self.weekend_pass,
        self.two_day_pass,
    ) = (date_parser_helper(offset) for offset in range(4))

    self.black_list = ['черговий викладач']
    self.black_list.extend(
        (
            self.date_now,
            self.date_next,
            self.weekend_pass,
            self.two_day_pass,
        )
    )
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def find_with_table(document):
    """Locate the first table among body elements 2..15 and cache its index.

    Args:
        document: Parsed document JSON; ``document['body']['content']`` is
            indexed positionally.

    Returns:
        The index of the first element that has ``["table"]["rows"]``, or
        ``False`` when the content ends first or none of the elements
        2..15 is a table. On success the index is also written to
        ``<config.config_folder>/table_element.txt``.
    """
    for c_element in range(2, 16):
        try:
            document['body']["content"][c_element]["table"]["rows"]
        except KeyError:
            # Not a table (or no such key): try the next element.
            continue
        except IndexError:
            # Ran past the end of the document content.
            return False
        break
    else:
        return False

    with open("{}/table_element.txt".format(config.config_folder), 'w') as f:
        f.write(str(c_element))
    return c_element
|
||||||
|
|
||||||
|
def find_with_text(self, document):
    """Extract ``(group, text)`` pairs from plain-text lines of the document.

    Body elements 4..14 are inspected. A line is used when it contains a
    hyphen (the first en dash is normalised to ``-``) and contains none of
    the ``self.black_list`` entries. The line is split on ``-``; for
    elements up to index 6 a tab split is tried as a fallback.

    Args:
        document: Parsed document JSON.

    Returns:
        A list of ``(group, text)`` tuples; empty when nothing matched.
    """
    format_charset = '-'
    alternative_format_charset = "\t"
    data = []

    for element in range(4, 15):
        try:
            raw = (
                document['body']["content"][element]
                ["paragraph"]["elements"][0]["textRun"]["content"]
            )
        except (KeyError, IndexError, TypeError):
            # Short document, table/section element or a run without text:
            # nothing to parse here (this used to crash the whole call).
            continue

        doc = raw.rstrip("\n").replace("–", "-", 1)
        if format_charset not in doc:
            continue

        lowered = doc.lower()
        # Every blacklist entry must be absent. The previous
        # ``[...][0]`` looked at the first entry only, so the date
        # entries were never applied.
        if any(banned in lowered for banned in self.black_list):
            continue

        try:
            group, text = doc.split(format_charset)
        except ValueError:
            # More than one hyphen in the line.
            if element > 6:
                break
            try:
                group, text = doc.split(alternative_format_charset)
            except ValueError:
                # Neither separator gives exactly two parts: skip the line
                # instead of reusing values left over from an earlier one.
                continue

        if text != '':
            data.append(
                (group.strip(" "), text.lstrip(" ").replace("\t", ""))
            )
    return data
|
||||||
|
|
||||||
|
def get_date(self, document):
    """Return the first header line that looks like the replacements date.

    Body elements 1..15 are scanned. A line matches when it contains
    "заміни на" (case-insensitive) or, with spaces removed, one of the
    precomputed date strings (``date_now``, ``date_next``,
    ``weekend_pass``, ``two_day_pass``).

    Args:
        document: Parsed document JSON.

    Returns:
        The matching line with trailing spaces/newlines stripped, or
        ``False`` when no line matches.
    """
    known_dates = (
        self.date_now,
        self.date_next,
        self.weekend_pass,
        self.two_day_pass,
    )

    for date_element in range(1, 16):
        try:
            date = (
                document['body']["content"][date_element]
                ["paragraph"]["elements"][0]["textRun"]["content"]
                .rstrip(" \n")
            )
        except (KeyError, IndexError, TypeError):
            # Element is missing or carries no text. The old bare
            # ``except`` fell through and used an unbound/stale ``date``.
            continue

        lowered = date.lower()
        # The earlier ``.lstrip("заміни").lstrip("на")`` only removed
        # leading letters, which cannot change whether a digits-and-dots
        # date is contained in the string, so it is dropped.
        compact = lowered.replace(" ", "")
        if "заміни на" in lowered or any(d in compact for d in known_dates):
            return date

    return False
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_table_element():
    """Return the cached index of the table element, defaulting to 6.

    The index is read from ``<config.config_folder>/table_element.txt``.
    When that file does not exist or does not contain an integer, 6 is
    returned.
    """
    cache_path = f"{config.config_folder}/table_element.txt"
    try:
        # ``with`` closes the handle, which the previous bare ``open()`` never did.
        with open(cache_path, 'r') as f:
            return int(f.read())
    except (FileNotFoundError, ValueError):
        # No cache yet, or its content is not a number.
        return 6
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def teacher(document):
    """Find the text run that mentions the duty teacher.

    Body elements 1..5 are inspected; within each paragraph every text run
    is checked in order.

    Args:
        document: Parsed document JSON.

    Returns:
        The first text run (trailing newlines stripped) whose lower-cased
        content, with en dashes removed, contains "черговий викладач";
        ``None`` when there is no such run.
    """
    marker = 'черговий викладач'

    # Slicing tolerates documents with fewer than six elements, which
    # previously raised IndexError.
    for block in document['body']["content"][1:6]:
        if "paragraph" not in block:
            continue
        for run in block["paragraph"].get("elements", ()):
            try:
                doc = run["textRun"]["content"].rstrip("\n")
            except (KeyError, TypeError):
                # Run without text (no "textRun" key).
                continue
            if marker in doc.lower().replace("–", ""):
                return doc
    return None
|
||||||
|
|
||||||
|
|
||||||
def image_parser(soup: BeautifulSoup):
|
|
||||||
image: Any
|
@classmethod
|
||||||
extension = ('png', 'jpg')
|
def find_image(cls, document):
|
||||||
main = soup.find("main")
|
for i in document['body']["content"]:
|
||||||
for ext in extension:
|
if ("paragraph" in i) and ("elements" in i["paragraph"]):
|
||||||
image = main.select(f'img[src$=".{ext}"]')
|
if "inlineObjectElement" in i["paragraph"]["elements"][0]:
|
||||||
if image:
|
import base64
|
||||||
return image[0]['src']
|
return True, base64.b64encode(open("photo.png", 'rb').read()).decode('utf-8')
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
9
setup_google_docs_api.py
Normal file
9
setup_google_docs_api.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
'''
|
||||||
|
Don`t move this file!
|
||||||
|
'''
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
from parser import docs_parse
|
||||||
|
docs_parse()
|
2
website-parser/__init__.py
Normal file
2
website-parser/__init__.py
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
from .parser import get_about_replacements, docs_parse
|
||||||
|
__all__ = ['get_about_replacements', 'docs_parse']
|
68
website-parser/parser.py
Normal file
68
website-parser/parser.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import datetime
|
||||||
|
from datetime import datetime as dt
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
try:
|
||||||
|
from load import config
|
||||||
|
except ImportError: config = None
|
||||||
|
try:
|
||||||
|
from .utils import *
|
||||||
|
except ImportError:
|
||||||
|
from utils import *
|
||||||
|
|
||||||
|
|
||||||
|
headers = {
|
||||||
|
'user-agent':(
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; WOW64) "
|
||||||
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||||
|
"Chrome/62.0.3202.9 Safari/537.36"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def date_parser_helper(days:int, parse:str="%d.%m.20%y"):
    """Format the current local time shifted by ``days`` days.

    Args:
        days: Day offset added to ``datetime.now()``.
        parse: ``strftime`` format string.

    Returns:
        The formatted date string.
    """
    shifted = dt.now() + datetime.timedelta(days=days)
    return shifted.strftime(parse)
|
||||||
|
|
||||||
|
|
||||||
|
def docs_parse():
    """Download the replacements image and store it in ``config.data_file``.

    The page at ``config.link`` is fetched, the first matching image URL
    is taken from ``image_parser`` and the image bytes are written,
    base64-encoded, as JSON under ``data.all``.

    Raises:
        requests.RequestException: On network errors, timeouts or a
            non-2xx HTTP status.
        ValueError: When the page contains no image to download.
    """
    # Without a timeout ``requests`` waits forever on a stalled server.
    timeout = 30

    output = {
        "data": {},
        "another_teacher": None
    }

    page = requests.get(config.link, headers=headers, timeout=timeout)
    page.raise_for_status()
    page.encoding = 'utf-8'

    soup = BeautifulSoup(page.text, "lxml")

    # Ideally this should be rewritten...
    url = image_parser(soup)
    if not url:
        # Fail with a clear message instead of passing ``None`` to requests.
        raise ValueError("No replacements image found on the page")

    with requests.get(url=url, allow_redirects=True, stream=True,
                      timeout=timeout) as r:
        # Do not store an HTTP error page as if it were the image.
        r.raise_for_status()
        output['image'] = True
        output['date'] = 'невозможно получить!'
        output['data']['all'] = base64.b64encode(r.content).decode('utf-8')

    with open(config.data_file, 'w') as f:
        json.dump(output, f, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
|
def get_about_replacements() -> dict:
    """Load and return the JSON payload stored in ``config.data_file``."""
    with open(config.data_file, 'r') as stored:
        return json.load(stored)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
docs_parse()
|
34
website-parser/utils.py
Normal file
34
website-parser/utils.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
def table_parser(soup: BeautifulSoup, output):
    """Fill ``output`` with the date and per-group replacements from a page.

    Args:
        soup: Parsed page; must contain a ``<main>`` element with at least
            two ``<span style="color:black">`` elements (the second one
            is taken as the date).
        output: Dict with a ``"data"`` dict; it is updated in place.

    Returns:
        The same ``output`` object.
    """
    # Date parser
    date = (soup.find("main").findAll('span', style="color:black"))[1]
    output["date"] = date.text.replace(u'\xa0', u'')

    # Replaces parser
    for row in soup.findAll('tr'):
        cell = row.find("td", valign="top")
        group_span = row.find("span", style="color:black")
        text_span = (
            cell.find("span", style="color:black")
            if cell is not None else None
        )
        if group_span is None or text_span is None:
            # Row without the expected cell/span (e.g. a header or spacer
            # row): skip it instead of failing on ``None``.
            continue

        text = text_span.text.replace(u'\xa0', u'')
        group = group_span.text.replace(" ", "").replace(u'\xa0', u'')
        output["data"][group] = text

    return output
|
||||||
|
|
||||||
|
|
||||||
|
def image_parser(soup: BeautifulSoup):
    """Return the ``src`` of the first PNG or JPG image inside ``<main>``.

    ``.png`` images are preferred over ``.jpg`` ones.

    Args:
        soup: Parsed page.

    Returns:
        The image URL, or ``None`` when the page has no ``<main>`` element
        or no matching image.
    """
    main = soup.find("main")
    if main is None:
        # Previously this failed with AttributeError on ``None.select``.
        return None

    for ext in ('png', 'jpg'):
        image = main.select(f'img[src$=".{ext}"]')
        if image:
            return image[0]['src']
    return None
|
Loading…
Reference in New Issue
Block a user