From 1de6d611d51a82994c30bd580644c3d6edd1aef4 Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Sun, 6 Oct 2024 20:44:36 -0700 Subject: [PATCH 01/11] Add TableMerger --- lib/sycamore/sycamore/transforms/llm_query.py | 26 ++- .../sycamore/transforms/merge_elements.py | 153 +++++++++++++++++- 2 files changed, 174 insertions(+), 5 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/llm_query.py b/lib/sycamore/sycamore/transforms/llm_query.py index ef7275f73..7eb32dde1 100644 --- a/lib/sycamore/sycamore/transforms/llm_query.py +++ b/lib/sycamore/sycamore/transforms/llm_query.py @@ -48,6 +48,7 @@ def __init__( llm_kwargs: dict = {}, per_element: bool = True, element_type: Optional[str] = None, + table_cont: Optional[bool] = False, ): self._llm = llm self._prompt = prompt @@ -57,16 +58,25 @@ def __init__( self._format_kwargs = format_kwargs self._number_of_elements = number_of_elements self._element_type = element_type + self._table_cont = table_cont def execute_query(self, document: Document) -> Document: final_prompt = self._prompt element_count = 0 + prev_table = -1 if self._per_element or self._number_of_elements: for idx, element in enumerate(document.elements): if self._element_type and element.type != self._element_type: continue if self._per_element: - document.elements[idx] = self._query_text_object(element) + if not self._table_cont: + document.elements[idx] = self._query_text_object(element) + else: + if prev_table > 0: + document.elements[idx] = self._query_text_object(element, document.elements[prev_table]) + else: + document.elements[idx] = self._query_text_object(element) + prev_table = idx else: final_prompt += "\n" + element["text_representation"] if self._number_of_elements: @@ -83,7 +93,9 @@ def execute_query(self, document: Document) -> Document: return document @timetrace("LLMQueryText") - def _query_text_object(self, object: Union[Document, Element]) -> Union[Document, Element]: + def _query_text_object( + self, object: Union[Document, Element], objectPrev: Element = None + ) -> Union[Document, Element]: if object.text_representation: if self._format_kwargs: prompt = ( @@ -92,10 +104,16 @@ def _query_text_object(self, object: Union[Document, Element]) -> Union[Document .render(doc=object) ) else: - prompt = self._prompt + "\n" + object.text_representation + if objectPrev and objectPrev.text_representation: + prompt = self._prompt + "\n" + objectPrev.text_representation + "\n\n" + object.text_representation + else: + prompt = self._prompt + "\n" + object.text_representation prompt_kwargs = {"prompt": prompt} llm_resp = self._llm.generate(prompt_kwargs=prompt_kwargs, llm_kwargs=self._llm_kwargs) - object["properties"][self._output_property] = llm_resp + if self._table_cont: + object["properties"]["table_continuation"] = llm_resp + else: + object["properties"][self._output_property] = llm_resp return object diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index d81a434f6..dcb714d4d 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -1,13 +1,16 @@ from abc import ABC, abstractmethod from typing import Any, Dict +from collections import defaultdict +import re -from sycamore.data import Document, Element, BoundingBox +from sycamore.data import Document, Element, BoundingBox, Table from sycamore.data.document import DocumentPropertyTypes from sycamore.plan_nodes import SingleThreadUser, NonGPUUser, Node from sycamore.functions.tokenizer import Tokenizer from sycamore.transforms.map import Map from sycamore.utils.time_trace import timetrace +from sycamore.transforms.llm_query import LLMTextQueryAgent class ElementMerger(ABC): @@ -412,6 +415,154 @@ def merge_elements(self, document: Document) -> Document: return document +class TableMerger(ElementMerger): + """ + The ``Table merger`` handles 3 operations + 1. If a text element (Caption, Section-header, Text...) contains the regex pattern anywhere in a page + it is attached to the text_representation of the table on the page. + 2. LLMQuery is used for adding a table_continuation property to table elements. Is the table is + a continuation from a previous table the property is stored as true, else false. + 3. After LLMQuery, table elements which are continuations are merged as one element. + Example: + .. code-block:: python + + llm = OpenAI(OpenAIModels.GPT_4O, api_key = '') + + prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine if the second table is a continuation of the first with 100% certainty. Check either of the following:\ + 1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors because of OCR quality) in both tables. If the headers are almost the same check the number of columns, they should be roughly the same.\ + 2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically be in continutaion of the last row in the first table.\ + Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. Certainty is determinedx if either of the two conditions is true." + + regex_pattern = r"table \d+" + + merger = TableMerger(llm_prompt = prompt, llm=llm) + + context = sycamore.init() + pdf_docset = context.read.binary(paths, binary_format="pdf", regex_pattern= regex_pattern) + .partition(partitioner=ArynPartitioner()) + .merge(merger=merger) + """ + + def __init__(self, regex_pattern=None, llm_prompt=None, llm=None, *args, **kwargs): + self.regex_pattern = regex_pattern + self.llm_prompt = llm_prompt + self.llm = llm + + def merge_elements(self, document: Document) -> Document: + + table_elements = [ele for ele in document.elements if ele.type == "table"] + if len(table_elements) < 1: + return document + if self.regex_pattern: + document.elements = self.customTableHeaderAdditionFilter(document.elements) + if not self.llm_prompt or len(table_elements) < 2: + return document + document = self.process_llm_query(document) + table_elements = [ele for ele in document.elements if ele.type == "table"] + other_elements = [ele for ele in document.elements if ele.type != "table"] + new_table_elements = [table_elements[0]] + for element in table_elements[1:]: + if self.should_merge(new_table_elements[-1], element): + new_table_elements[-1] = self.merge(new_table_elements[-1], element) + else: + new_table_elements.append(element) + other_elements.extend(new_table_elements) + document.elements = other_elements + return document + + def should_merge(self, element1: Element, element2: Element) -> bool: + if "true" in element2["properties"]["table_continuation"].lower(): + return True + return False + + def merge(self, elt1: Element, elt2: Element) -> Element: + + tok1 = elt1.data["token_count"] + tok2 = elt2.data["token_count"] + new_elt = Element() + new_elt.type = "table" + # Merge binary representations by concatenation + if elt1.binary_representation is None or elt2.binary_representation is None: + new_elt.binary_representation = elt1.binary_representation or elt2.binary_representation + else: + new_elt.binary_representation = elt1.binary_representation + elt2.binary_representation + # Merge text representations by concatenation with a newline + if elt1.text_representation is None or elt2.text_representation is None: + new_elt.text_representation = elt1.text_representation or elt2.text_representation + new_elt.data["token_count"] = max(tok1, tok2) + else: + new_elt.text_representation = elt1.text_representation + "\n" + elt2.text_representation + new_elt.data["token_count"] = tok1 + 1 + tok2 + # Merge bbox by taking the coords that make the largest box + # if elt1.bbox is None and elt2.bbox is None: + # pass + # elif elt1.bbox is None or elt2.bbox is None: + # new_elt.bbox = elt1.bbox or elt2.bbox + # else: + # new_elt.bbox = BoundingBox( + # min(elt1.bbox.x1, elt2.bbox.x1), + # min(elt1.bbox.y1, elt2.bbox.y1), + # max(elt1.bbox.x2, elt2.bbox.x2), + # max(elt1.bbox.y2, elt2.bbox.y2), + # ) + # Merge properties by taking the union of the keys + properties = new_elt.properties + for k, v in elt1.properties.items(): + properties[k] = v + if k == DocumentPropertyTypes.PAGE_NUMBER: + properties["page_numbers"] = properties.get("page_numbers", list()) + properties["page_numbers"] = list(set(properties["page_numbers"] + [v])) + for k, v in elt2.properties.items(): + if properties.get(k) is None: + properties[k] = v + # if a page number exists, add it to the set of page numbers for this new element + if k == DocumentPropertyTypes.PAGE_NUMBER: + properties["page_numbers"] = properties.get("page_numbers", list()) + properties["page_numbers"] = list(set(properties["page_numbers"] + [v])) + + new_elt.properties = properties + + return new_elt + + def customTableHeaderAdditionFilter(self, elements): + + dic = defaultdict(str) + + # First pass: capture headers + for ele in elements: + if ele.type in ["table", "Image", "Formula"]: + continue + elif ele.type in ["Text", "Title", "Page-header", "Section-header", "Caption"]: + if ele.text_representation is not None: + text_rep = ele.text_representation.strip().lower() + if text_rep == "": + continue + if re.search(self.regex_pattern, text_rep): + dic[ele["properties"]["page_number"]] = text_rep + " " + + # Second pass: update table elements with headers, done in separate loops since + # table headers can be within table elements as well or after them + for ele in elements: + if ele.type == "table" and isinstance(ele["table"], Table): + ele.text_representation = dic[ele["properties"]["page_number"]] + ele.text_representation + ele["properties"]["table_header"] = dic[ele["properties"]["page_number"]] + + return elements + + def process_llm_query(self, document): + # Here you can implement how to use the LLM prompt on merged elements + llm_query_agent = LLMTextQueryAgent(prompt=self.llm_prompt, element_type="table", llm=self.llm, table_cont=True) + # Assume you have a method to extract relevant information from merged elements + llm_results = llm_query_agent.execute_query(document) + return llm_results + + def preprocess_element(self, elem: Element) -> Element: + return elem + + def postprocess_element(self, elem: Element) -> Element: + return elem + + class Merge(SingleThreadUser, NonGPUUser, Map): """ Merge Elements into fewer large elements From f50b359aa7faa06080640b972a0173219d3bd759 Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Sun, 6 Oct 2024 20:55:51 -0700 Subject: [PATCH 02/11] Fix merging code --- .../sycamore/transforms/merge_elements.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index dcb714d4d..8314c1c9c 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -477,8 +477,6 @@ def should_merge(self, element1: Element, element2: Element) -> bool: def merge(self, elt1: Element, elt2: Element) -> Element: - tok1 = elt1.data["token_count"] - tok2 = elt2.data["token_count"] new_elt = Element() new_elt.type = "table" # Merge binary representations by concatenation @@ -489,22 +487,8 @@ def merge(self, elt1: Element, elt2: Element) -> Element: # Merge text representations by concatenation with a newline if elt1.text_representation is None or elt2.text_representation is None: new_elt.text_representation = elt1.text_representation or elt2.text_representation - new_elt.data["token_count"] = max(tok1, tok2) else: new_elt.text_representation = elt1.text_representation + "\n" + elt2.text_representation - new_elt.data["token_count"] = tok1 + 1 + tok2 - # Merge bbox by taking the coords that make the largest box - # if elt1.bbox is None and elt2.bbox is None: - # pass - # elif elt1.bbox is None or elt2.bbox is None: - # new_elt.bbox = elt1.bbox or elt2.bbox - # else: - # new_elt.bbox = BoundingBox( - # min(elt1.bbox.x1, elt2.bbox.x1), - # min(elt1.bbox.y1, elt2.bbox.y1), - # max(elt1.bbox.x2, elt2.bbox.x2), - # max(elt1.bbox.y2, elt2.bbox.y2), - # ) # Merge properties by taking the union of the keys properties = new_elt.properties for k, v in elt1.properties.items(): @@ -550,9 +534,7 @@ def customTableHeaderAdditionFilter(self, elements): return elements def process_llm_query(self, document): - # Here you can implement how to use the LLM prompt on merged elements llm_query_agent = LLMTextQueryAgent(prompt=self.llm_prompt, element_type="table", llm=self.llm, table_cont=True) - # Assume you have a method to extract relevant information from merged elements llm_results = llm_query_agent.execute_query(document) return llm_results From a2e8f785ccf5fe0159b5d2a064c805bf6c0e773d Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Wed, 9 Oct 2024 14:23:11 -0700 Subject: [PATCH 03/11] Some fixes --- lib/sycamore/sycamore/transforms/llm_query.py | 4 ++-- lib/sycamore/sycamore/transforms/merge_elements.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/llm_query.py b/lib/sycamore/sycamore/transforms/llm_query.py index 7eb32dde1..920702bdd 100644 --- a/lib/sycamore/sycamore/transforms/llm_query.py +++ b/lib/sycamore/sycamore/transforms/llm_query.py @@ -72,7 +72,7 @@ def execute_query(self, document: Document) -> Document: if not self._table_cont: document.elements[idx] = self._query_text_object(element) else: - if prev_table > 0: + if prev_table >= 0: document.elements[idx] = self._query_text_object(element, document.elements[prev_table]) else: document.elements[idx] = self._query_text_object(element) @@ -94,7 +94,7 @@ def execute_query(self, document: Document) -> Document: @timetrace("LLMQueryText") def _query_text_object( - self, object: Union[Document, Element], objectPrev: Element = None + self, object: Union[Document, Element], objectPrev: Optional[Element] = None ) -> Union[Document, Element]: if object.text_representation: if self._format_kwargs: diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 8314c1c9c..43835518c 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict +from typing import Any, Dict, Pattern from collections import defaultdict import re @@ -11,6 +11,8 @@ from sycamore.transforms.map import Map from sycamore.utils.time_trace import timetrace from sycamore.transforms.llm_query import LLMTextQueryAgent +from sycamore.llms import LLM + class ElementMerger(ABC): @@ -443,7 +445,7 @@ class TableMerger(ElementMerger): .merge(merger=merger) """ - def __init__(self, regex_pattern=None, llm_prompt=None, llm=None, *args, **kwargs): + def __init__(self, regex_pattern: Optional[Pattern] = None, llm_prompt: Optional[str] = None, llm=Optional[LLM] = None, *args, **kwargs): self.regex_pattern = regex_pattern self.llm_prompt = llm_prompt self.llm = llm @@ -471,9 +473,7 @@ def merge_elements(self, document: Document) -> Document: return document def should_merge(self, element1: Element, element2: Element) -> bool: - if "true" in element2["properties"]["table_continuation"].lower(): - return True - return False + return "true" in element2["properties"]["table_continuation"].lower() def merge(self, elt1: Element, elt2: Element) -> Element: From e609221841bcbff445f818da087bdcce39a3e034 Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Wed, 9 Oct 2024 17:24:24 -0700 Subject: [PATCH 04/11] Fix bug --- lib/sycamore/sycamore/transforms/merge_elements.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 43835518c..0a047c4ae 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, Pattern +from typing import Any, Dict, Pattern, Optional from collections import defaultdict import re @@ -14,7 +14,6 @@ from sycamore.llms import LLM - class ElementMerger(ABC): @abstractmethod def should_merge(self, element1: Element, element2: Element) -> bool: @@ -445,7 +444,14 @@ class TableMerger(ElementMerger): .merge(merger=merger) """ - def __init__(self, regex_pattern: Optional[Pattern] = None, llm_prompt: Optional[str] = None, llm=Optional[LLM] = None, *args, **kwargs): + def __init__( + self, + regex_pattern: Optional[Pattern] = None, + llm_prompt: Optional[str] = None, + llm: Optional[LLM] = None, + *args, + **kwargs + ): self.regex_pattern = regex_pattern self.llm_prompt = llm_prompt self.llm = llm From 6dad1d3904e206a86c4c2a9eddce0421e6911cbc Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Wed, 9 Oct 2024 18:11:33 -0700 Subject: [PATCH 05/11] Lint --- lib/sycamore/sycamore/transforms/merge_elements.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 0a047c4ae..11ce18199 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -429,10 +429,14 @@ class TableMerger(ElementMerger): llm = OpenAI(OpenAIModels.GPT_4O, api_key = '') - prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine if the second table is a continuation of the first with 100% certainty. Check either of the following:\ - 1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors because of OCR quality) in both tables. If the headers are almost the same check the number of columns, they should be roughly the same.\ - 2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically be in continutaion of the last row in the first table.\ - Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. Certainty is determinedx if either of the two conditions is true." + prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine if the second table\ + is a continuation of the first with 100% certainty. Check either of the following:\ + 1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors because of OCR quality)\ + in both tables. If the headers are almost the same check the number of columns, they should be roughly the same.\ + 2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically\ + be in continutaion of the last row in the first table.\ + Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. \ + Certainty is determined if either of the two conditions is true." regex_pattern = r"table \d+" From af172eeb4d6033300c34573526253065dbf08370 Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Sun, 13 Oct 2024 01:36:56 -0700 Subject: [PATCH 06/11] Fix till empty text_representation is released in sycamore --- lib/sycamore/sycamore/transforms/merge_elements.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 11ce18199..1fdf81df3 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -483,7 +483,9 @@ def merge_elements(self, document: Document) -> Document: return document def should_merge(self, element1: Element, element2: Element) -> bool: - return "true" in element2["properties"]["table_continuation"].lower() + if "table_continuation" in element2["properties"]: + return "true" in element2["properties"]["table_continuation"].lower() + return False def merge(self, elt1: Element, elt2: Element) -> Element: From 402f3f5f3237e009dd71a401f43f2231853ffc0d Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Tue, 15 Oct 2024 18:07:53 -0700 Subject: [PATCH 07/11] Fixes --- lib/sycamore/sycamore/data/__init__.py | 2 +- .../sycamore/transforms/merge_elements.py | 67 ++++++++++++++----- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/lib/sycamore/sycamore/data/__init__.py b/lib/sycamore/sycamore/data/__init__.py index 737d008c3..3cafb8cdb 100644 --- a/lib/sycamore/sycamore/data/__init__.py +++ b/lib/sycamore/sycamore/data/__init__.py @@ -1,5 +1,5 @@ from sycamore.data.bbox import BoundingBox -from sycamore.data.table import Table +from sycamore.data.table import Table, TableCell from sycamore.data.element import Element, ImageElement, TableElement from sycamore.data.document import ( Document, diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 1fdf81df3..72507c0a4 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -4,7 +4,7 @@ import re -from sycamore.data import Document, Element, BoundingBox, Table +from sycamore.data import Document, Element, BoundingBox, Table, TableElement, TableCell from sycamore.data.document import DocumentPropertyTypes from sycamore.plan_nodes import SingleThreadUser, NonGPUUser, Node from sycamore.functions.tokenizer import Tokenizer @@ -429,12 +429,13 @@ class TableMerger(ElementMerger): llm = OpenAI(OpenAIModels.GPT_4O, api_key = '') - prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine if the second table\ - is a continuation of the first with 100% certainty. Check either of the following:\ - 1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors because of OCR quality)\ - in both tables. If the headers are almost the same check the number of columns, they should be roughly the same.\ - 2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically\ - be in continutaion of the last row in the first table.\ + prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine\ + if the second table is a continuation of the first with 100% certainty. Check either of the following:\ + 1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors \ + because of OCR quality) in both tables. If the headers are almost the same check the number of columns,\ + they should be roughly the same.\ + 2. Missing headers: If the header/columns in the second table are missing, then the first row in the + second table should logically be in continutaion of the last row in the first table.\ Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. \ Certainty is determined if either of the two conditions is true." @@ -454,7 +455,7 @@ def __init__( llm_prompt: Optional[str] = None, llm: Optional[LLM] = None, *args, - **kwargs + **kwargs, ): self.regex_pattern = regex_pattern self.llm_prompt = llm_prompt @@ -482,15 +483,42 @@ def merge_elements(self, document: Document) -> Document: document.elements = other_elements return document - def should_merge(self, element1: Element, element2: Element) -> bool: + def should_merge(self, element1: TableElement, element2: TableElement) -> bool: if "table_continuation" in element2["properties"]: return "true" in element2["properties"]["table_continuation"].lower() return False - def merge(self, elt1: Element, elt2: Element) -> Element: + def merge(self, elt1: TableElement, elt2: TableElement) -> TableElement: + + # Combine the cells, adjusting the row indices for the second table + offset_row = elt1.table.num_rows + merged_cells = elt1.table.cells + [ + TableCell( + content=cell.content, + rows=[r + offset_row for r in cell.rows], + cols=cell.cols, + is_header=cell.is_header, + bbox=cell.bbox, + properties=cell.properties, + ) + for cell in elt2.table.cells + ] + + # Create a new Table object with merged cells + merged_table = Table(cells=merged_cells) + + title1 = elt1.data["properties"].get("title", "") or "" + title2 = elt2.data["properties"].get("title", "") or "" + merged_title = f"{title1} / {title2}".strip(" / ") + # Create a new TableElement with the merged table and combined metadata + new_elt = TableElement( + title=merged_title if merged_title else None, + columns=elt1.columns if elt1.columns else elt2.columns, + rows=elt1.rows + elt2.rows if elt1.rows and elt2.rows else None, + table=merged_table, + tokens=elt1.tokens + elt2.tokens if elt1.tokens and elt2.tokens else None, + ) - new_elt = Element() - new_elt.type = "table" # Merge binary representations by concatenation if elt1.binary_representation is None or elt2.binary_representation is None: new_elt.binary_representation = elt1.binary_representation or elt2.binary_representation @@ -530,7 +558,7 @@ def customTableHeaderAdditionFilter(self, elements): continue elif ele.type in ["Text", "Title", "Page-header", "Section-header", "Caption"]: if ele.text_representation is not None: - text_rep = ele.text_representation.strip().lower() + text_rep = ele.text_representation.strip() if text_rep == "": continue if re.search(self.regex_pattern, text_rep): @@ -541,19 +569,24 @@ def customTableHeaderAdditionFilter(self, elements): for ele in elements: if ele.type == "table" and isinstance(ele["table"], Table): ele.text_representation = dic[ele["properties"]["page_number"]] + ele.text_representation - ele["properties"]["table_header"] = dic[ele["properties"]["page_number"]] - + if ele["properties"]["title"]: + ele["properties"]["title"] = ( + ele["properties"]["title"] + "\n" + dic[ele["properties"]["page_number"]] + ) + else: + ele["properties"]["title"] = dic[ele["properties"]["page_number"]] return elements def process_llm_query(self, document): + # TO-DO: Add async llm query llm_query_agent = LLMTextQueryAgent(prompt=self.llm_prompt, element_type="table", llm=self.llm, table_cont=True) llm_results = llm_query_agent.execute_query(document) return llm_results - def preprocess_element(self, elem: Element) -> Element: + def preprocess_element(self, elem: TableElement) -> TableElement: return elem - def postprocess_element(self, elem: Element) -> Element: + def postprocess_element(self, elem: TableElement) -> TableElement: return elem From 2a6c22f946faf837b0ee81c6f5bf659e47c123dc Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Tue, 15 Oct 2024 18:14:02 -0700 Subject: [PATCH 08/11] Fixes --- lib/sycamore/sycamore/data/__init__.py | 1 + lib/sycamore/sycamore/transforms/merge_elements.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/lib/sycamore/sycamore/data/__init__.py b/lib/sycamore/sycamore/data/__init__.py index 3cafb8cdb..b02918da8 100644 --- a/lib/sycamore/sycamore/data/__init__.py +++ b/lib/sycamore/sycamore/data/__init__.py @@ -21,4 +21,5 @@ "OpenSearchQuery", "OpenSearchQueryResult", "Table", + "TableCell", ] diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 72507c0a4..91bb295b6 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -12,6 +12,7 @@ from sycamore.utils.time_trace import timetrace from sycamore.transforms.llm_query import LLMTextQueryAgent from sycamore.llms import LLM +from sycamore.utils.bbox_sort import bbox_sort_document class ElementMerger(ABC): @@ -481,6 +482,8 @@ def merge_elements(self, document: Document) -> Document: new_table_elements.append(element) other_elements.extend(new_table_elements) document.elements = other_elements + bbox_sort_document(document) + return document def should_merge(self, element1: TableElement, element2: TableElement) -> bool: From 89f6956ff9920b13207f79ba869e98f76f48b74c Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Tue, 15 Oct 2024 18:29:02 -0700 Subject: [PATCH 09/11] bbox & parent class fix --- .../sycamore/transforms/merge_elements.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 91bb295b6..30e6180e8 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -434,7 +434,7 @@ class TableMerger(ElementMerger): if the second table is a continuation of the first with 100% certainty. Check either of the following:\ 1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors \ because of OCR quality) in both tables. If the headers are almost the same check the number of columns,\ - they should be roughly the same.\ + they should be roughly the same. \ 2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically be in continutaion of the last row in the first table.\ Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. \ @@ -486,12 +486,12 @@ def merge_elements(self, document: Document) -> Document: return document - def should_merge(self, element1: TableElement, element2: TableElement) -> bool: + def should_merge(self, element1: Element, element2: Element) -> bool: if "table_continuation" in element2["properties"]: return "true" in element2["properties"]["table_continuation"].lower() return False - def merge(self, elt1: TableElement, elt2: TableElement) -> TableElement: + def merge(self, elt1: Element, elt2: Element) -> Element: # Combine the cells, adjusting the row indices for the second table offset_row = elt1.table.num_rows @@ -547,6 +547,18 @@ def merge(self, elt1: TableElement, elt2: TableElement) -> TableElement: properties["page_numbers"] = properties.get("page_numbers", list()) properties["page_numbers"] = list(set(properties["page_numbers"] + [v])) + # TO-DO: Currently bbox points to first table bbox, and other bboxs are removed in + # this process, potential fix can be to have a list of bboxs, and change label + # of bbox after first as "table_continuation" + if elt1.bbox is None or elt2.bbox is None: + new_elt.bbox = elt1.bbox or elt2.bbox + else: + new_elt.bbox = BoundingBox( + elt1.bbox.x1, + elt1.bbox.y1, + elt1.bbox.x2, + elt1.bbox.y2, + ) new_elt.properties = properties return new_elt @@ -586,10 +598,10 @@ def process_llm_query(self, document): llm_results = llm_query_agent.execute_query(document) return llm_results - def preprocess_element(self, elem: TableElement) -> TableElement: + def preprocess_element(self, elem: Element) -> Element: return elem - def postprocess_element(self, elem: TableElement) -> TableElement: + def postprocess_element(self, elem: Element) -> Element: return elem From ac8afbf49a75234c52367aca80198ef9c06a74e9 Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Tue, 15 Oct 2024 18:44:58 -0700 Subject: [PATCH 10/11] mypy --- lib/sycamore/sycamore/transforms/merge_elements.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index 30e6180e8..a70efd4bb 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -493,6 +493,9 @@ def should_merge(self, element1: Element, element2: Element) -> bool: def merge(self, elt1: Element, elt2: Element) -> Element: + # Check if both elements are TableElements + if not isinstance(elt1, TableElement) or not isinstance(elt2, TableElement): + raise TypeError("Both elements must be of type TableElement to perform merging.") # Combine the cells, adjusting the row indices for the second table offset_row = elt1.table.num_rows merged_cells = elt1.table.cells + [ From e8e8e2288c76fd5c1edcbd5dcf4ebc318db1f5f9 Mon Sep 17 00:00:00 2001 From: Dhruv Kaliraman Date: Tue, 15 Oct 2024 18:56:44 -0700 Subject: [PATCH 11/11] mypy --- lib/sycamore/sycamore/transforms/merge_elements.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py index a70efd4bb..1ed1e1ef0 100644 --- a/lib/sycamore/sycamore/transforms/merge_elements.py +++ b/lib/sycamore/sycamore/transforms/merge_elements.py @@ -497,6 +497,9 @@ def merge(self, elt1: Element, elt2: Element) -> Element: if not isinstance(elt1, TableElement) or not isinstance(elt2, TableElement): raise TypeError("Both elements must be of type TableElement to perform merging.") # Combine the cells, adjusting the row indices for the second table + if elt1.table is None or elt2.table is None: + raise ValueError("Both elements must have a table to perform merging.") + offset_row = elt1.table.num_rows merged_cells = elt1.table.cells + [ TableCell(