Strange use of Lambda arrow
edmondo.giovannozzi at gmail.com
edmondo.giovannozzi at gmail.com
Sat Jun 6 06:52:51 EDT 2020
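Have a look at the documentation of the typing module:
https://docs.python.org/3/library/typing.html
The "->" in a function signature is not a lambda arrow: it is a return type
annotation (a type hint). For example, "def f(x: str) -> int:" states that f
takes a str and is expected to return an int.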
Il giorno venerdì 5 giugno 2020 18:35:10 UTC+2, Agnese Camellini ha scritto:
> Hello everyone, lately I have been building an open source project with some
> collaborators, but one of them cannot contribute any more. He is a solution
> architect, so he is very skilled (much more than me!). I am now analysing
> his code to finish the job, but I don't get this use of the lambda arrow:
> it's like he is declaring the returned type in the function signature (as
> you would do in Java). I have never seen something like this in Python.
>
> Can someone please explain this usage to me (the part regarding the
> question is highlighted in yellow):
>
> @classmethod
> def extract_document_data(cls, file_path : str) -> DocumentData:
> """
> Entry point of the module, it extracts the data from the document
> whose path is passed as input.
> The extraction strategy is automatically chosen based on the MIME
> type
> of the file.
>
> @type file_path: str
> @param file_path: The path of the document to be parsed.
> @rtype: DocumentData
> @returns: An object containing the data of the parsed document.
> """
>
> mime = magic.Magic(mime=True)
> mime_type = mime.from_file(file_path)
> document_type = DocumentType.get_instance(mime_type)
> strategy = cls.strategies[document_type]
> return strategy.extract_document_data(file_path)
>
>
> To be more verbose, this is the whole script:
>
> from enum import Enum
> import json
> import magic
>
> import docx
> from pdfminer.converter import PDFPageAggregator
> from pdfminer.layout import LAParams, LTContainer, LTTextContainer
> from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
> from pdfminer.pdfinterp import PDFPageInterpreter
> from pdfminer.pdfinterp import PDFResourceManager
> from pdfminer.pdfpage import PDFPage
> from pdfminer.pdfparser import PDFParser
>
>
> class DocumentType(Enum):
> """
> Defines the handled document types.
> Each value is associated to a MIME type.
> """
>
> def __init__(self, mime_type):
> self.mime_type = mime_type
>
> @classmethod
> def get_instance(cls, mime_type : str):
> values = [e for e in cls]
> for value in values:
> if value.mime_type == mime_type:
> return value
> raise MimeNotValidError(mime_type)
>
> PDF = 'application/pdf'
> DOCX =
> 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
>
>
> class MimeNotValidError(Exception):
> """
> Exception to be raised when a not valid MIME type is processed.
> """
>
> pass
>
>
> class DocumentData:
> """
> Wrapper for the extracted document data (TOC and contents).
> """
>
> def __init__(self, toc : list = [], pages : list = [], document_text :
> str = None):
> self.toc = toc
> self.pages = pages
> if document_text is not None:
> self.document_text = document_text
> else:
> self.document_text = ' '.join([page.replace('\n', ' ') for page
> in pages])
>
> def toc_as_json(self) -> str:
> return json.dumps(self.toc)
>
>
> class ExtractionStrategy:
> """
> Base class for the extraction strategies.
> """
>
> @staticmethod
> def extract_document_data(file_path : str) -> DocumentData:
> pass
>
>
> class DOCXExtractionStrategy(ExtractionStrategy):
> """
> It implements the TOC and contents extraction from a DOCX document.
> """
>
> @staticmethod
> def extract_document_data(file_path : str) -> DocumentData:
> document = docx.Document(file_path)
> body_elements = document._body._body
> # Selecting only the <w:t> elements from DOCX XML,
> # as they're the only to contain some text.
> text_elems = body_elements.xpath('.//w:t')
> return DocumentData(document_text = ' '.join([elem.text for elem in
> text_elems]))
>
>
> class PDFExtractionStrategy(ExtractionStrategy):
> """
> It implements the TOC and contents extraction from a PDF document.
> """
>
> @staticmethod
> def parse_toc(doc : PDFDocument) -> list:
> raw_toc = []
> try:
> outlines = doc.get_outlines()
> for (level, title, dest, a, se) in outlines:
> raw_toc.append((level, title))
> except PDFNoOutlines:
> pass
> return PDFExtractionStrategy.build_toc_tree(raw_toc)
>
> @staticmethod
> def build_toc_tree(items : list) -> list:
> """
> Builds the TOC tree from a list of TOC items.
>
> @type items: list
> @param items: The TOC items.
> Each item must have the following format: (<item depth>, <item
> description>).
> E.g: [(1, 'Contents'), (2, 'Chapter 1'), (2, 'Chapter 2')]
> @rtype: list
> @returns: The TOC tree. The tree hasn't a root element, therefore it
> actually is a list.
> """
>
> toc = []
> if items is None or len(items) == 0:
> return toc
> current_toc_level = toc
> # Using an explicit stack containing the lists corresponding to
> # the various levels of the TOC, to simulate the recursive building
> # of the TOC tree in a more efficient way
> toc_levels_stack = []
> toc_levels_stack.append(current_toc_level)
>
> # Each TOC item can be inserted into the current TOC level as
> # string (just the item description) or as dict, where the key is
> # the item description and the value is a list containing the
> # children TOC items.
> # To correctly determine how to insert the current item into
> # the current level, a kind of look-ahead is needed, that is
> # the depth of the next item has to be considered.
>
> # Initializing the variables related to the previous item.
> prev_item_depth, prev_item_desc = items[0]
> # Adding a fake final item in order to handle all the TOC items
> # inside the cycle.
> items.append((-1, ''))
>
> for i in range(1, len(items)):
> # In fact each iteration handles the item of the previous
> # one, using the current item to determine how to insert
> # the previous item into the current TOC level,
> # as explained before.
> curr_item = items[i]
> curr_item_depth = curr_item[0]
>
> if curr_item_depth == prev_item_depth:
> # The depth of the current item is the same
> # as the previous one.
> # Inserting the previous item into the current TOC level
> # as string.
> current_toc_level.append(prev_item_desc)
> elif curr_item_depth == prev_item_depth + 1:
> # The depth of the current item is increased by 1 compared to
> # the previous one.
> # Inserting the previous item into the current TOC level
> # as dict.
> prev_item_dict = { prev_item_desc : [] }
> current_toc_level.append(prev_item_dict)
> # Updating the current TOC level with the newly created one
> # which contains the children of the previous item.
> current_toc_level = prev_item_dict[prev_item_desc]
> toc_levels_stack.append(current_toc_level)
> elif curr_item_depth < prev_item_depth:
> # The depth of the current item is less than
> # the previous one.
> # Inserting the previous item into the current TOC level
> # as string.
> current_toc_level.append(prev_item_desc)
> if i < len(items)-1:
> # Executing these steps for all the items except the last one
> depth_diff = prev_item_depth - curr_item_depth
> # Removing from the stack as many TOC levels as the difference
> # between the depth of the previous item and the depth of the
> # current one.
> for i in range(0, depth_diff):
> toc_levels_stack.pop()
> # Updating the current TOC level with the one contained in
> # the head of the stack.
> current_toc_level = toc_levels_stack[-1]
> # Updating the previous item with the current one
> prev_item_depth, prev_item_desc = curr_item
>
> return toc
>
> @staticmethod
> def from_bytestring(s) -> str:
> """
> If the input string is a byte-string, converts it to a string using
> UTF-8 as encoding.
>
> @param s: A string or a byte-string.
> @rtype: str
> @returns: The potentially converted string.
> """
>
> if s:
> if isinstance(s, str):
> return s
> else:
> return s.encode('utf-8')
>
> @staticmethod
> def parse_layout_nodes(container : LTContainer) -> str:
> """
> Recursively extracts the text from all the nodes contained in the
> input PDF layout tree/sub-tree.
>
> @type container: LTContainer
> @param container: The PDF layout tree/sub-tree from which to
> extract the text.
> @rtype: str
> @returns: A string containing the extracted text.
> """
>
> text_content = []
>
> # The iterator returns the children nodes.
> for node in container:
> if isinstance(node, LTTextContainer):
> # Only nodes of type LTTextContainer contain text.
>
> text_content.append(PDFExtractionStrategy.from_bytestring(node.get_text()))
> elif isinstance(node, LTContainer):
> # Recursively calling the method on the current node, which
> # is a container itself.
>
> text_content.append(PDFExtractionStrategy.parse_layout_nodes(node))
> else:
> # Ignoring all the other node types.
> pass
>
> # Joining all the extracted text chunks with a new line character.
> return "\n".join(text_content)
>
> @staticmethod
> def parse_pages(doc : PDFDocument) -> list:
> rsrcmgr = PDFResourceManager()
> laparams = LAParams()
> device = PDFPageAggregator(rsrcmgr, laparams=laparams)
> interpreter = PDFPageInterpreter(rsrcmgr, device)
>
> text_content = []
> for i, page in enumerate(PDFPage.create_pages(doc)):
> interpreter.process_page(page)
> layout = device.get_result()
> # Extracts the text from all the nodes of the PDF layout tree
> # of each page
>
> text_content.append(PDFExtractionStrategy.parse_layout_nodes(layout))
>
> return text_content
>
> @staticmethod
> def parse_pdf(file_path : str) -> (list, list):
> toc = []
> pages = []
> try:
> fp = open(file_path, 'rb')
> parser = PDFParser(fp)
> doc = PDFDocument(parser)
> parser.set_document(doc)
>
> if doc.is_extractable:
> toc = PDFExtractionStrategy.parse_toc(doc)
> pages = PDFExtractionStrategy.parse_pages(doc)
>
> fp.close()
> except IOError:
> pass
> return (toc, pages)
>
> @staticmethod
> def extract_document_data(file_path : str) -> DocumentData:
> toc, pages = PDFExtractionStrategy.parse_pdf(file_path)
> return DocumentData(toc, pages = pages)
>
>
> class DocumentDataExtractor:
> """
> Main class of the module.
> It's responsible for actually executing the text extraction.
> The output is constituted by the following items:
> -table of contents (TOC);
> -pages contents.
> """
>
> # Dictionary containing the extraction strategies for the different
> # document types, indexed by the corresponding DocumentType enum values.
> strategies = {
> DocumentType.DOCX : DOCXExtractionStrategy(),
> DocumentType.PDF : PDFExtractionStrategy()
> }
>
> @classmethod
> def extract_document_data(cls, file_path : str) -> DocumentData:
> """
> Entry point of the module, it extracts the data from the document
> whose path is passed as input.
> The extraction strategy is automatically chosen based on the MIME
> type
> of the file.
>
> @type file_path: str
> @param file_path: The path of the document to be parsed.
> @rtype: DocumentData
> @returns: An object containing the data of the parsed document.
> """
>
> mime = magic.Magic(mime=True)
> mime_type = mime.from_file(file_path)
> document_type = DocumentType.get_instance(mime_type)
> strategy = cls.strategies[document_type]
> return strategy.extract_document_data(file_path)
More information about the Python-list
mailing list