first commit

This commit is contained in:
2023-03-03 23:40:08 +02:00
commit 01f19ebb1d
8 changed files with 317 additions and 0 deletions

2
parser/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
from .parser import get_about_replacements, docs_parse
__all__ = ['get_about_replacements', 'docs_parse']

67
parser/parser.py Normal file
View File

@@ -0,0 +1,67 @@
import base64
import json
import datetime
from datetime import datetime as dt
import requests
from bs4 import BeautifulSoup
try:
from load import config
except ImportError: config = None
try:
from .utils import *
except ImportError:
from utils import *
# Default HTTP headers for the page request: identify as a desktop Chrome
# browser instead of the default python-requests user agent.
headers = {
    'user-agent': " ".join((
        "Mozilla/5.0 (Windows NT 10.0; WOW64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/62.0.3202.9 Safari/537.36",
    )),
}
def date_parser_helper(days:int, parse:str="%d.%m.20%y"):
    """Format the local date/time shifted by *days* days from now.

    Args:
        days: Offset in days relative to the current moment; may be negative.
        parse: ``strftime`` format string applied to the shifted timestamp.

    Returns:
        The formatted string (``DD.MM.20YY`` with the default format).
    """
    offset = datetime.timedelta(days=days)
    target = dt.now() + offset
    return target.strftime(parse)
def docs_parse():
    """Download the image published on the configured page and cache it as JSON.

    Fetches ``config.parser.link``, asks ``image_parser`` for the ``src`` of
    an image inside the page, downloads that image and writes a JSON document
    to ``config.data_file`` with these keys:

    * ``data`` -- ``{"all": <base64 text of the image bytes>}``
    * ``another_teacher`` -- always ``None``
    * ``image`` -- always ``True``
    * ``date`` -- a fixed placeholder string (no date is read from an image)

    The file is always written as UTF-8 so that the non-ASCII placeholder
    survives regardless of the platform's default encoding.

    Raises:
        requests.exceptions.RequestException: on a network failure, a
            timeout, a non-success HTTP status, or when no image could be
            located on the page.
    """
    # Local import: resolves relative/protocol-relative image sources.
    from urllib.parse import urljoin

    timeout = 30  # seconds; without it a stalled server blocks forever

    output = {
        "data": {},
        "another_teacher": None
    }

    page = requests.get(config.parser.link, headers=headers, timeout=timeout)
    # Fail loudly instead of scraping an HTTP error page.
    page.raise_for_status()
    page.encoding = 'utf-8'
    soup = BeautifulSoup(page.text, "lxml")

    # TODO: ideally this part should be rewritten...
    src = image_parser(soup)
    if not src:
        # InvalidURL keeps the same base classes (RequestException and
        # ValueError) as the MissingSchema error that passing None to
        # requests.get used to trigger, but with a meaningful message.
        raise requests.exceptions.InvalidURL(
            "no image found on the parsed page"
        )
    # page.url is the final URL after redirects; absolute sources pass
    # through urljoin unchanged.
    url = urljoin(page.url, src)

    # The body is consumed in full below, so streaming mode is not needed.
    with requests.get(url=url, allow_redirects=True, timeout=timeout) as r:
        # Never cache an error body as if it were the image.
        r.raise_for_status()
        image_bytes = r.content

    output['image'] = True
    output['date'] = 'невозможно получить!'
    output['data']['all'] = base64.b64encode(image_bytes).decode('utf-8')

    with open(config.data_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False)


def get_about_replacements() -> dict:
    """Return the JSON document stored in ``config.data_file``.

    The file is read as UTF-8, matching how ``docs_parse`` writes it.

    Raises:
        OSError: if the data file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(config.data_file, 'r', encoding='utf-8') as f:
        return json.load(f)
# Script entry point: when this module is executed directly, run one parse
# pass, which rewrites the data file at config.data_file.
if __name__ == "__main__":
    docs_parse()

34
parser/utils.py Normal file
View File

@@ -0,0 +1,34 @@
from bs4 import BeautifulSoup
from typing import Any
def table_parser(soup: BeautifulSoup, output):
    """Fill *output* with the date and the per-group texts found in *soup*.

    The date is taken from the second ``<span style="color:black">`` inside
    ``<main>``. Every ``<tr>`` in the document is then inspected: the first
    black span of the row gives the group name (spaces removed) and the
    first black span inside the row's ``<td valign="top">`` gives the text
    stored under that group. Non-breaking spaces are stripped from all
    extracted strings.

    Rows that lack either element are skipped, so a single row with a
    different layout no longer aborts the whole parse.

    Args:
        soup: Parsed page.
        output: Mapping with a ``"data"`` dict; mutated in place.

    Returns:
        The same *output* object, with ``"date"`` set and ``"data"`` filled.

    Raises:
        AttributeError: if the page has no ``<main>`` element.
        IndexError: if ``<main>`` has fewer than two black spans.
    """
    nbsp = u'\xa0'

    # Date parser
    date = soup.find("main").find_all('span', style="color:black")[1]
    output["date"] = date.text.replace(nbsp, u'')

    # Replacements parser
    for row in soup.find_all('tr'):
        group_span = row.find("span", style="color:black")
        cell = row.find("td", valign="top")
        # Compare with None explicitly rather than relying on tag truthiness.
        text_span = (
            cell.find("span", style="color:black")
            if cell is not None else None
        )
        if group_span is None or text_span is None:
            continue  # row does not have the expected layout

        group = group_span.text.replace(" ", "").replace(nbsp, u'')
        output["data"][group] = text_span.text.replace(nbsp, u'')
    return output
def image_parser(soup: BeautifulSoup):
    """Return the ``src`` of an image located inside the page's ``<main>``.

    Extensions are tried in order: every ``<img>`` whose ``src`` ends in
    ``.png`` is considered before any ending in ``.jpg``, and the first
    match in document order for the winning extension is returned. The
    suffix match is case-sensitive.

    Args:
        soup: Parsed page.

    Returns:
        The ``src`` attribute value of the matched image, or ``None`` when
        the page has no ``<main>`` element or no matching image.
    """
    main = soup.find("main")
    if main is None:
        # No <main> container: report "not found" instead of raising
        # AttributeError on None.
        return None

    for ext in ('png', 'jpg'):
        image = main.select_one(f'img[src$=".{ext}"]')
        if image is not None:
            return image['src']
    return None