Update parser

This commit is contained in:
tema 2023-09-04 20:55:37 +03:00
parent 478460525e
commit 8754cf841e
Signed by: tema
GPG Key ID: 21FDB6D162488F6F
2 changed files with 38 additions and 5 deletions

View File

@ -67,11 +67,13 @@ def get_about_replacements() -> dict:
try: try:
count = document['body']["content"][element]["table"]["rows"] count = document['body']["content"][element]["table"]["rows"]
except (IndexError, KeyError): except (IndexError, KeyError):
element = helper.find_with_table(document) image, image_bytes = helper.find_image(document)
if element: if not image:
count = document['body']["content"][element]["table"]["rows"] element = helper.find_with_table(document)
else: if element:
info = helper.find_with_text(document) count = document['body']["content"][element]["table"]["rows"]
else:
info = helper.find_with_text(document)
date = helper.get_date(document) date = helper.get_date(document)
@ -101,6 +103,13 @@ def get_about_replacements() -> dict:
) )
) )
if image:
return {
"image": image,
'date': date if type(date) != type(False) else "Error" ,
'data': {"all": image_bytes},
'another_teacher': another_teacher,
}
return { return {
'date': date if type(date) != type(False) else "Error" , 'date': date if type(date) != type(False) else "Error" ,
'data': dict(info), 'data': dict(info),

View File

@ -2,6 +2,8 @@ import os
import datetime import datetime
from datetime import datetime as dt from datetime import datetime as dt
import requests
from load import config from load import config
@ -186,3 +188,25 @@ class Helper():
return doc return doc
element += 1 element += 1
@classmethod
def get_link_and_download(cls, id_doc, document, timeout=30):
    """Open a streaming download of the image behind an inline object.

    Looks ``id_doc`` up in ``document['inlineObjects']``, reads the
    ``contentUri`` of its embedded image and issues a streaming HTTP GET.

    Args:
        id_doc: Key of the inline object in ``document['inlineObjects']``.
        document: Parsed document dict (key names match a Google Docs API
            document resource - TODO confirm against the caller).
        timeout: Seconds ``requests`` waits on the server before raising;
            without it a stalled server would block the parser forever.

    Returns:
        The raw response stream (``Response.raw``), or ``None`` when the
        document has no inline object with this id.

    Raises:
        KeyError: If the inline object lacks the expected image properties.
        requests.exceptions.RequestException: On connection errors/timeouts.
    """
    # Guard clause: nothing to download if the object is not in the document.
    if "inlineObjects" not in document or id_doc not in document["inlineObjects"]:
        return None

    embedded = (document
                ['inlineObjects'][id_doc]['inlineObjectProperties']
                ['embeddedObject'])
    link = embedded['imageProperties']['contentUri']

    # NOTE(review): the HTTP status is not checked and ``raw`` is not
    # content-decoded; callers receive the stream exactly as served.
    response = requests.get(link, stream=True, timeout=timeout)
    return response.raw
@classmethod
def find_image(cls, document):
    """Find the first inline image in the document body and download it.

    Walks ``document['body']['content']`` and, for each entry that has a
    paragraph with elements, looks for an element carrying an
    ``inlineObjectElement``. The first hit is passed to
    ``get_link_and_download``.

    Args:
        document: Parsed document dict (key names match a Google Docs API
            document resource - TODO confirm against the caller).

    Returns:
        ``(True, stream)`` for the first inline object found, where
        ``stream`` is whatever ``get_link_and_download`` returns;
        ``(False, None)`` when the body contains no inline object.
    """
    for block in document['body']["content"]:
        if "paragraph" not in block or "elements" not in block["paragraph"]:
            continue

        elements = block["paragraph"]["elements"]
        # ``elements`` is normally a list of element dicts, so testing the
        # key against the list itself never matched. A single dict is still
        # accepted for backward compatibility.
        if isinstance(elements, dict):
            elements = [elements]

        for element in elements:
            if isinstance(element, dict) and "inlineObjectElement" in element:
                object_id = element["inlineObjectElement"]["inlineObjectId"]
                return True, cls.get_link_and_download(object_id, document)

    return False, None