ufes-mestrado-projetopesqui.../docRefNetCreator/documents/pdfreader.py

52 lines
1.9 KiB
Python

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import subprocess
from pathlib import Path
from typing import List
from typing import Union
from .document import Document
def _parse_pdftotext_version(banner: str) -> tuple:
    """Extract the numeric version tuple from the ``pdftotext -v`` banner.

    Only the first line is inspected; its last space-separated token is split
    on dots and every purely numeric component is converted to ``int``.

    Args:
        banner: Decoded text that ``pdftotext -v`` wrote to stderr.

    Returns:
        A tuple of ints such as ``(0, 86, 1)``, or ``()`` when the banner is
        empty or its last token carries no numeric component.
    """
    lines = banner.splitlines()
    if not lines:
        # Nothing was printed at all; let the caller report it.
        return ()
    last_token = lines[0].split(' ')[-1]
    return tuple(int(part) for part in last_token.split('.') if part.isdigit())


# Import-time guard: this module is unusable without a recent 'pdftotext'.
# Only the process spawn can raise FileNotFoundError, so only it is guarded.
try:
    _probe = subprocess.run(
        ['pdftotext', '-v'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
except FileNotFoundError as err:
    raise ImportError("PdfReader needs 'pdftotext' command in your PATH") from err

if _probe.returncode != 0:
    raise ImportError("PdfReader needs that 'pdftotext' properly informs its version.")

# The version banner is read from stderr, not stdout.
version = _parse_pdftotext_version(_probe.stderr.decode('utf-8', 'ignore'))
if not version:
    # An empty or non-numeric banner is "version unknown", not "outdated".
    raise ImportError("PdfReader needs that 'pdftotext' properly informs its version.")
if version < (0, 41, 0):
    raise ImportError("Your 'pdftotext' is outdated. Its minimum version is 0.41.0.")
class PdfReader(Document):
    """Document whose pages are the text ``pdftotext`` extracts from a PDF.

    The PDF bytes are piped through ``pdftotext -layout``; the output is split
    on form-feed characters and the resulting list of page strings is handed
    to ``_set_document_pages``.
    """

    @classmethod
    def _opens(cls):
        """Return the format identifiers this reader declares (``['pdf']``).

        NOTE(review): presumably file extensions consulted by ``Document``
        when choosing a reader -- confirm against the base class.
        """
        return ['pdf']

    def __init__(self, resource: Union[str, bytes, Path]):
        """Load a PDF and store the text of each of its pages.

        Args:
            resource: A filesystem path (``str`` or ``Path``), an object with
                a ``read()`` method, or the raw PDF content as ``bytes``
                (``bytearray`` and ``memoryview`` are accepted as well).

        Raises:
            ValueError: If ``resource`` cannot be turned into bytes, or if
                ``pdftotext`` exits with a non-zero status for it.
        """
        # Normalise in one linear pass instead of re-entering __init__, so a
        # text-mode stream whose read() returns str is rejected below rather
        # than being mistaken for a filesystem path.
        if isinstance(resource, str):
            resource = Path(resource)
        if isinstance(resource, Path):
            resource = resource.read_bytes()
        elif hasattr(resource, 'read'):
            resource = resource.read()
        if isinstance(resource, (bytearray, memoryview)):
            resource = bytes(resource)
        if not isinstance(resource, bytes):
            raise ValueError("Constructor argument is not bytes or anything known to be converted to bytes")
        super().__init__()
        self._set_document_pages(self.__convert_pdf_to_text(resource))

    def __convert_pdf_to_text(self, resource) -> List[str]:
        """Run ``pdftotext`` over the PDF bytes and return one string per page.

        Args:
            resource: Raw PDF content, fed to ``pdftotext`` on stdin.

        Returns:
            The extracted text split on form-feed characters, each chunk
            decoded as UTF-8 with undecodable bytes replaced.

        Raises:
            ValueError: If ``pdftotext`` exits with a non-zero status.
        """
        proc = subprocess.run(
            ['pdftotext', '-layout', '-', '-'],  # stdin -> stdout
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            input=resource
        )
        if proc.returncode != 0:
            raise ValueError("Constructed with an unhealthy PDF")
        chunks = proc.stdout.split(b'\x0c')
        # When the output ends with a form feed, the split leaves an empty
        # trailing chunk that is not a page; drop that one chunk only.
        if len(chunks) > 1 and not chunks[-1]:
            chunks.pop()
        return [chunk.decode('utf-8', 'replace') for chunk in chunks]