diff --git a/src-python/trp/__init__.py b/src-python/trp/__init__.py index baa4daf..ddd7d58 100644 --- a/src-python/trp/__init__.py +++ b/src-python/trp/__init__.py @@ -109,7 +109,8 @@ class Geometry: def __init__(self, geometry): boundingBox = geometry["BoundingBox"] polygon = geometry["Polygon"] - bb = BoundingBox(boundingBox["Width"], boundingBox["Height"], boundingBox["Left"], boundingBox["Top"]) + bb = BoundingBox( + boundingBox["Width"], boundingBox["Height"], boundingBox["Left"], boundingBox["Top"]) pgs = [] for pg in polygon: pgs.append(Polygon(pg["X"], pg["Y"])) @@ -249,7 +250,8 @@ def __init__(self, block, blockMap): if ('Relationships' in vkvs): for vitem in vkvs['Relationships']: if (vitem["Type"] == "CHILD"): - self._value = FieldValue(vkvs, vitem['Ids'], blockMap) + self._value = FieldValue( + vkvs, vitem['Ids'], blockMap) else: logger.warning(f"no 'Relationships' in block: {block}") @@ -391,12 +393,13 @@ def __init__(self, block, blockMap, rows): for cid in rs['Ids']: blockType = blockMap[cid]["BlockType"] if (blockType == "CELL"): - child_cell = next((x for x in cells if x.id == cid), None) + child_cell = next( + (x for x in cells if x.id == cid), None) if child_cell != None: child_cell._isChildOfMergedCell = True child_cell._mergedCellParent = self - if len(self._text) == 0 and len(child_cell.text) > 0: - self._text = child_cell.text.strip() + if len(child_cell.text) > 0: + self._text += " " + child_cell.text.strip() if ('EntityTypes' in block and block['EntityTypes']): self._entityTypes = block['EntityTypes'] @@ -437,10 +440,12 @@ def __init__(self, block, blockMap): for cid in rs['Ids']: cell = Cell(blockMap[cid], blockMap) cells.append(cell) - cells.sort(key=lambda cell: (cell.rowIndex, cell.columnIndex)) + cells.sort(key=lambda cell: ( + cell.rowIndex, cell.columnIndex)) for row_index in range(1, max([x.rowIndex for x in cells]) + 1): new_row: Row = Row() - new_row.cells = [x for x in cells if x.rowIndex == row_index] + new_row.cells = [ + x for x in cells if x.rowIndex == row_index] self._rows.append(new_row) elif (rs['Type'] == 'MERGED_CELL'): self._merged_cells_ids = rs['Ids'] @@ -562,11 +567,12 @@ def getLinesInReadingOrder(self): for index, column in enumerate(columns): bbox_left = item.geometry.boundingBox.left bbox_right = item.geometry.boundingBox.left + item.geometry.boundingBox.width - bbox_centre = item.geometry.boundingBox.left + item.geometry.boundingBox.width / 2 + bbox_centre = item.geometry.boundingBox.left + \ + item.geometry.boundingBox.width / 2 column_centre = column['left'] + column['right'] / 2 if (bbox_centre > column['left'] and bbox_centre < column['right']) or (column_centre > bbox_left and column_centre < bbox_right): - #Bbox appears inside the column + # Bbox appears inside the column lines.append([index, item.text]) column_found = True break @@ -663,7 +669,8 @@ def _parseDocumentPagesAndBlockMap(self): if documentPage: documentPage.append(block) else: - logger.error("assumed documentPage not None, but was None") + logger.error( + "assumed documentPage not None, but was None") if (documentPage): documentPages.append({"Blocks": documentPage}) return documentPages, blockMap @@ -692,3 +699,6 @@ def getBlockById(self, blockId): if (self._blockMap and blockId in self._blockMap): block = self._blockMap[blockId] return block + + def deleteBlockById(self, blockId): + self._blockMap.pop(blockId, None) diff --git a/src-python/trp/t_tables.py b/src-python/trp/t_tables.py index 7a27cde..8fc1685 100644 --- a/src-python/trp/t_tables.py +++ b/src-python/trp/t_tables.py @@ -15,6 +15,7 @@ class HeaderFooterType(Enum): NONE = 0 NARROW = 0.5 NORMAL = 1 + WIDE = 2.5 logger = logging.getLogger(__name__) diff --git a/src-python/trp/trp2.py b/src-python/trp/trp2.py index 9a7d4c0..1e59718 100644 --- a/src-python/trp/trp2.py +++ b/src-python/trp/trp2.py @@ -52,6 +52,7 @@ class TextractBlockTypes(Enum): class TextractEntityTypes(Enum): KEY = auto() VALUE = auto() + COLUMN_HEADER = auto() @dataclass(eq=True, repr=True) @@ -778,6 +779,12 @@ def merge_tables(self, table_array_ids: List[List[str]]): cell_block = self.get_block_by_id(cell_id) if cell_block and cell_block.row_index and parent_last_row: cell_block.row_index = parent_last_row + cell_block.row_index + # This is to make sure the child table's headers are merged + # as regular rows into the parent. + if cell_block.entity_types and len(cell_block.entity_types) > 0: + cell_block.entity_types = [ + entity_type for entity_type in cell_block.entity_types if entity_type != TextractEntityTypes.COLUMN_HEADER.name] + if parent_relationships.ids and cell_id not in parent_relationships.ids: parent_relationships.ids.append(cell_id) self.delete_blocks([table_id])