#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
from os import linesep as eol
from pathlib import Path
import json
import re
toc_line_regex = re.compile(r'.+(\s*?[\.]){6,}\s*\d+')
toc_line_parsing_regex = re.compile(r'^\s*?((?:(?:[^\s\.]+?)(?:[\s\.])?)+?)[-\s]+(.*?)(?:\s*\.)+\s+([\divxdcm]+)$', re.I | re.M | re.S)
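# Illustrative note (hypothetical input): a ToC line such as
#   "2.1  Methods ........................ 17"
# is detected by toc_line_regex; toc_line_parsing_regex then captures
# roughly ('2.1', 'Methods', '17'): section number, title, and page,
# where the page may also be a roman numeral (hence [\divxdcm] and re.I).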
# A page's first line is its candidate header; its last line, its candidate footer.
def get_possible_header_from_text(ll):
    if len(ll) <= 0:
        return "", ""
    return ll[0].strip(), ll[1:]

def get_possible_header(ll):
    return get_possible_header_from_text(ll)[0]

def get_possible_footer_from_text(ll):
    if len(ll) <= 0:
        return "", ""
    return ll[-1].strip(), ll[:-1]

def get_possible_footer(ll):
    return get_possible_footer_from_text(ll)[0]
def get_middle_sample(dp):
    # Sample the middle half of the document, skipping front and back matter.
    return dp[1*(len(dp)//4): 3*(len(dp)//4)]
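# For example, for a 40-page document get_middle_sample returns pages 10
# through 29, so covers, the ToC and references are unlikely to be sampled.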
def find_header_footer_ignorable_part(samples, not_found=""):
    if len(samples) <= 0:
        return not_found
    # Words common to every sample are candidates for a repeated header/footer.
    intersection = set(samples[0].split())
    for sample in samples:
        intersection.intersection_update(sample.split())
    if len(intersection) == 0:
        return not_found
    # Take the span of samples[0] covering all common words...
    part_ranges = list(zip(*[(i[0], i[0] + i[1])
                             for i in [(samples[0].find(part), len(part))
                                       for part in intersection]]))
    ignorable = samples[0][min(part_ranges[0]):max(part_ranges[1])]
    # ...and accept it only if that exact substring appears in every sample.
    for sample in samples:
        if ignorable not in sample:
            return not_found
    return ignorable
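# Illustrative sketch (hypothetical data): headers that share a running title
# but differ in page number,
#     find_header_footer_ignorable_part(["My Thesis   3", "My Thesis   4"])
# have the word intersection {"My", "Thesis"}; the span of samples[0]
# covering those words, "My Thesis", occurs in every sample, so it is
# returned as the ignorable part while the page numbers stay outside it.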
def strip_empty_lines(page):
    # Trim leading and trailing blank lines from a page (a list of lines).
    start = 0
    stop = len(page)
    for i, line in enumerate(page):
        if line.strip() == '':
            start = i + 1
        else:
            break
    for i, line in reversed(list(enumerate(page))):
        if line.strip() == '':
            stop = i
        else:
            break
    return page[start:stop]
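# e.g. strip_empty_lines(['', '', 'Chapter 1', '']) returns ['Chapter 1'].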
def cleanup_page_from_header_and_footer(pages, ignorable_header, ignorable_footer):
    cleaned = list()
    for brute_page in pages:
        useful_header = ""
        useful_footer = ""
        page = brute_page
        if ignorable_header != "":
            pntp, ptp = get_possible_header_from_text(page)
            if ignorable_header in pntp:
                # Whatever remains next to the repeated part (typically the
                # page number) is kept as the useful header.
                useful_header = pntp.replace(ignorable_header, "").strip()
                page = ptp
        if ignorable_footer != "":
            pntp, ptp = get_possible_footer_from_text(page)
            if ignorable_footer in pntp:
                useful_footer = pntp.replace(ignorable_footer, "").strip()
                page = ptp
        page = strip_empty_lines(page)
        cleaned.append((page, useful_header, useful_footer))
    return cleaned
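# e.g. with ignorable_header "My Thesis", a page headed "My Thesis   7" has
# that header line removed and "7" kept as its useful_header.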
def parse_toc(pages):
    toc = list()
    # Disabled: toc_line_parsing_regex can backtrack catastrophically on some
    # ToC pages and never return, so an empty ToC is returned instead.
    return toc
    for page in pages:
        page_content = '\n'.join(page[0]).split('\n' * 3)[0]
        while page_content.startswith(' ' * 20):
            page_content = '\n'.join(page_content.splitlines()[1:])
        for match in toc_line_parsing_regex.finditer(page_content):
            toc.append((
                match[1].strip().strip('.'),
                ' '.join(match[2].replace('\n', ' ').split()).strip(),
                match[3].strip()
            ))
    return toc
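# If re-enabled, parse_toc would yield (section_number, title, page) tuples,
# e.g. ('2.1', 'Methods', '17') for the ToC line illustrated above.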
def get_textual_part_default(pages):
    return (pages, list())

def get_textual_part_handling_toc(pages, toc_page):
    # Skip forward past every page that still looks like a table of contents.
    current_page = toc_page
    while toc_line_regex.search('\n'.join(pages[current_page][0])):
        current_page += 1
        if current_page >= len(pages):
            return None
    toc_pages = pages[toc_page:current_page]
    textual_part = pages[current_page:]
    return (textual_part, parse_toc(toc_pages))
def get_textual_part_from_header_and_footer(pages):
    # Keep only pages whose remaining header or footer is a bare page number.
    return (list(filter(lambda a: (a[1].strip().isdigit() or a[2].strip().isdigit()), pages)), list())
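# Illustrative: a page whose useful_header or useful_footer is a bare number
# (e.g. "3") survives the filter; pages where both are empty are dropped,
# on the assumption that unnumbered pages are pretextual material.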
def get_textual_part(pages, has_header, has_footer, toc_names):
    # Strategy 1: locate the ToC by its title and start after it.
    for i, page in enumerate(pages):
        possible_title = get_possible_header(page[0])
        if possible_title.strip().lower() in toc_names:
            option = get_textual_part_handling_toc(pages, i)
            if option is None:
                break
            else:
                return option
    # Strategy 2: fall back to header/footer page numbers.
    if has_header or has_footer:
        return get_textual_part_from_header_and_footer(pages)
    # Strategy 3: keep the whole document.
    return get_textual_part_default(pages)
def fix_paragraphs(lines):
    fixed = list()
    buff = ''
    for line in lines + [None]:  # None is a sentinel that flushes the last paragraph
        if line is None:
            if len(buff) > 0:
                fixed.append(buff + '.' if not (buff.endswith('.') or buff.endswith(';') or buff.endswith(':')) else buff)
                buff = ''
        elif len(line.strip()) <= 0:
            # Blank line: close the current paragraph.
            if len(buff) > 0:
                fixed.append(buff + '.' if not (buff.endswith('.') or buff.endswith(';') or buff.endswith(':')) else buff)
                buff = ''
        else:
            ls = line.strip()
            if buff.endswith('-'):
                # Hyphenated line break: join directly.
                buff = buff + ls
            elif len(buff) >= 2 and buff[-2].isdigit() and buff[-1] == ')':
                # Buffer ends in an enumeration marker like "1)": keep appending.
                buff += ' ' + ls
            elif len(ls) >= 2 and (ls[0].islower() or (ls[0].isupper() and ls[1].isupper())) and ls[0].isalpha() and (ls[1].isalpha() or ls[1].isspace()):
                # Continuation line (starts lowercase or with two capitals, e.g. an acronym).
                buff += ' ' + ls
            else:
                # Looks like a new paragraph: flush the previous one.
                if len(buff) > 0:
                    fixed.append(buff + '.' if not (buff.endswith('.') or buff.endswith(';') or buff.endswith(':')) else buff)
                buff = ls
    return [' '.join(ln.split()) for ln in fixed]
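# Illustrative sketch (hypothetical input):
#     fix_paragraphs(['An exam-', 'ple paragraph', '', 'Next one.'])
# returns ['An exam-ple paragraph.', 'Next one.']: the hyphenated break is
# joined as-is, the continuation line is appended with a space, the blank
# line closes the paragraph, and a missing final period is added.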
class Document(object):
    @classmethod
    def _opens(cls):
        return []

    def _set_document_pages(self, pages):
        self._document_pages = pages

    def __init__(self):
        # self.has_cover = True
        # self.has_cover_sheet = True
        self.detect_number_on_first_page = True
        # self.has_pretextual_elements = True
        # self.entities_to_find = ['table', 'figure', 'image', 'chart']
        self.toc_names = ['contents', 'index', 'table of contents']
        # self.has_header = False
        # self.has_footer = True
        # self.has_tables = True
    def parse(self, cst_eol=eol):
        pages = list(map(str.splitlines, self._document_pages))
        sample_pages = get_middle_sample(pages)
        ignorable_header = find_header_footer_ignorable_part(list(map(get_possible_header, sample_pages)))
        ignorable_footer = find_header_footer_ignorable_part(list(map(get_possible_footer, sample_pages)))
        del sample_pages
        # Keep only non-empty pages, as (lines, useful_header, useful_footer, page_index).
        cleaned_pages = list(
            filter(lambda a: len(a[0]) > 0,
                   map(lambda a: (a[1][0], a[1][1], a[1][2], a[0]),
                       enumerate(cleanup_page_from_header_and_footer(pages, ignorable_header, ignorable_footer))
                       )))
        textual_part = get_textual_part(cleaned_pages, ignorable_header != "", ignorable_footer != "", self.toc_names)
        del ignorable_header
        del ignorable_footer
        del cleaned_pages
        joined_paragraphs = fix_paragraphs(eol.join(map(lambda a: eol.join(a[0]), textual_part[0])).splitlines())
        if cst_eol is None:
            return joined_paragraphs
        else:
            return cst_eol.join(joined_paragraphs)
        # for part in textual_part:
        #     print(part)
        # print(cleaned_page_contents)
        # print(sample_pages)
    def parsed_from_cache(self, cachekey: str, cst_eol: str = eol):
        # Parse once and cache the paragraph list as JSON under plaincache/<cachekey>.
        cached = None
        cached_disk = Path('plaincache', cachekey)
        if cached_disk.exists():
            cached = json.loads(cached_disk.read_text())
        else:
            cached = self.parse(None)
            cached_disk.parent.mkdir(parents=True, exist_ok=True)
            cached_disk.write_text(json.dumps(cached))
        return cst_eol.join(cached)
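
# Minimal usage sketch (hypothetical subclass; Document itself never loads
# pages, so a concrete reader is assumed to call _set_document_pages with
# one string per page):
if __name__ == '__main__':
    class PlainTextDocument(Document):
        @classmethod
        def _opens(cls):
            return ['.txt']

        def __init__(self, pages):
            super().__init__()
            self._set_document_pages(pages)

    demo = PlainTextDocument(['First page of text.', 'Second page of text.'])
    print(demo.parse())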