From 1de6d611d51a82994c30bd580644c3d6edd1aef4 Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Sun, 6 Oct 2024 20:44:36 -0700
Subject: [PATCH 01/11] Add TableMerger

---
 lib/sycamore/sycamore/transforms/llm_query.py |  26 ++-
 .../sycamore/transforms/merge_elements.py     | 153 +++++++++++++++++-
 2 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/lib/sycamore/sycamore/transforms/llm_query.py b/lib/sycamore/sycamore/transforms/llm_query.py
index ef7275f73..7eb32dde1 100644
--- a/lib/sycamore/sycamore/transforms/llm_query.py
+++ b/lib/sycamore/sycamore/transforms/llm_query.py
@@ -48,6 +48,7 @@ def __init__(
         llm_kwargs: dict = {},
         per_element: bool = True,
         element_type: Optional[str] = None,
+        table_cont: Optional[bool] = False,
     ):
         self._llm = llm
         self._prompt = prompt
@@ -57,16 +58,25 @@ def __init__(
         self._format_kwargs = format_kwargs
         self._number_of_elements = number_of_elements
         self._element_type = element_type
+        self._table_cont = table_cont
 
     def execute_query(self, document: Document) -> Document:
         final_prompt = self._prompt
         element_count = 0
+        prev_table = -1
         if self._per_element or self._number_of_elements:
             for idx, element in enumerate(document.elements):
                 if self._element_type and element.type != self._element_type:
                     continue
                 if self._per_element:
-                    document.elements[idx] = self._query_text_object(element)
+                    if not self._table_cont:
+                        document.elements[idx] = self._query_text_object(element)
+                    else:
+                        if prev_table > 0:
+                            document.elements[idx] = self._query_text_object(element, document.elements[prev_table])
+                        else:
+                            document.elements[idx] = self._query_text_object(element)
+                        prev_table = idx
                 else:
                     final_prompt += "\n" + element["text_representation"]
                 if self._number_of_elements:
@@ -83,7 +93,9 @@ def execute_query(self, document: Document) -> Document:
         return document
 
     @timetrace("LLMQueryText")
-    def _query_text_object(self, object: Union[Document, Element]) -> Union[Document, Element]:
+    def _query_text_object(
+        self, object: Union[Document, Element], objectPrev: Element = None
+    ) -> Union[Document, Element]:
         if object.text_representation:
             if self._format_kwargs:
                 prompt = (
@@ -92,10 +104,16 @@ def _query_text_object(self, object: Union[Document, Element]) -> Union[Document
                     .render(doc=object)
                 )
             else:
-                prompt = self._prompt + "\n" + object.text_representation
+                if objectPrev and objectPrev.text_representation:
+                    prompt = self._prompt + "\n" + objectPrev.text_representation + "\n\n" + object.text_representation
+                else:
+                    prompt = self._prompt + "\n" + object.text_representation
             prompt_kwargs = {"prompt": prompt}
             llm_resp = self._llm.generate(prompt_kwargs=prompt_kwargs, llm_kwargs=self._llm_kwargs)
-            object["properties"][self._output_property] = llm_resp
+            if self._table_cont:
+                object["properties"]["table_continuation"] = llm_resp
+            else:
+                object["properties"][self._output_property] = llm_resp
         return object
 
 
diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index d81a434f6..dcb714d4d 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -1,13 +1,16 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict
+from collections import defaultdict
+import re
 
 
-from sycamore.data import Document, Element, BoundingBox
+from sycamore.data import Document, Element, BoundingBox, Table
 from sycamore.data.document import DocumentPropertyTypes
 from sycamore.plan_nodes import SingleThreadUser, NonGPUUser, Node
 from sycamore.functions.tokenizer import Tokenizer
 from sycamore.transforms.map import Map
 from sycamore.utils.time_trace import timetrace
+from sycamore.transforms.llm_query import LLMTextQueryAgent
 
 
 class ElementMerger(ABC):
@@ -412,6 +415,154 @@ def merge_elements(self, document: Document) -> Document:
         return document
 
 
+class TableMerger(ElementMerger):
+    """
+    The ``Table merger`` handles 3 operations
+    1. If a text element (Caption, Section-header, Text...) contains the regex pattern anywhere in a page
+     it is attached to the text_representation of the table on the page.
+    2. LLMQuery is used for adding a table_continuation property to table elements. Is the table is
+     a continuation from a previous table the property is stored as true, else false.
+    3. After LLMQuery, table elements which are continuations are merged as one element.
+    Example:
+         .. code-block:: python
+
+            llm = OpenAI(OpenAIModels.GPT_4O, api_key = '')
+
+            prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine if the second table is a continuation of the first with 100% certainty. Check either of the following:\
+            1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors because of OCR quality) in both tables. If the headers are almost the same check the number of columns, they should be roughly the same.\
+            2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically be in continutaion of the last row in the first table.\
+            Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. Certainty is determinedx if either of the two conditions is true."
+
+            regex_pattern = r"table \d+"
+
+            merger = TableMerger(llm_prompt = prompt, llm=llm)
+
+            context = sycamore.init()
+            pdf_docset = context.read.binary(paths, binary_format="pdf", regex_pattern= regex_pattern)
+                .partition(partitioner=ArynPartitioner())
+                .merge(merger=merger)
+    """
+
+    def __init__(self, regex_pattern=None, llm_prompt=None, llm=None, *args, **kwargs):
+        self.regex_pattern = regex_pattern
+        self.llm_prompt = llm_prompt
+        self.llm = llm
+
+    def merge_elements(self, document: Document) -> Document:
+
+        table_elements = [ele for ele in document.elements if ele.type == "table"]
+        if len(table_elements) < 1:
+            return document
+        if self.regex_pattern:
+            document.elements = self.customTableHeaderAdditionFilter(document.elements)
+        if not self.llm_prompt or len(table_elements) < 2:
+            return document
+        document = self.process_llm_query(document)
+        table_elements = [ele for ele in document.elements if ele.type == "table"]
+        other_elements = [ele for ele in document.elements if ele.type != "table"]
+        new_table_elements = [table_elements[0]]
+        for element in table_elements[1:]:
+            if self.should_merge(new_table_elements[-1], element):
+                new_table_elements[-1] = self.merge(new_table_elements[-1], element)
+            else:
+                new_table_elements.append(element)
+        other_elements.extend(new_table_elements)
+        document.elements = other_elements
+        return document
+
+    def should_merge(self, element1: Element, element2: Element) -> bool:
+        if "true" in element2["properties"]["table_continuation"].lower():
+            return True
+        return False
+
+    def merge(self, elt1: Element, elt2: Element) -> Element:
+
+        tok1 = elt1.data["token_count"]
+        tok2 = elt2.data["token_count"]
+        new_elt = Element()
+        new_elt.type = "table"
+        # Merge binary representations by concatenation
+        if elt1.binary_representation is None or elt2.binary_representation is None:
+            new_elt.binary_representation = elt1.binary_representation or elt2.binary_representation
+        else:
+            new_elt.binary_representation = elt1.binary_representation + elt2.binary_representation
+        # Merge text representations by concatenation with a newline
+        if elt1.text_representation is None or elt2.text_representation is None:
+            new_elt.text_representation = elt1.text_representation or elt2.text_representation
+            new_elt.data["token_count"] = max(tok1, tok2)
+        else:
+            new_elt.text_representation = elt1.text_representation + "\n" + elt2.text_representation
+            new_elt.data["token_count"] = tok1 + 1 + tok2
+        # Merge bbox by taking the coords that make the largest box
+        # if elt1.bbox is None and elt2.bbox is None:
+        #     pass
+        # elif elt1.bbox is None or elt2.bbox is None:
+        #     new_elt.bbox = elt1.bbox or elt2.bbox
+        # else:
+        #     new_elt.bbox = BoundingBox(
+        #         min(elt1.bbox.x1, elt2.bbox.x1),
+        #         min(elt1.bbox.y1, elt2.bbox.y1),
+        #         max(elt1.bbox.x2, elt2.bbox.x2),
+        #         max(elt1.bbox.y2, elt2.bbox.y2),
+        #     )
+        # Merge properties by taking the union of the keys
+        properties = new_elt.properties
+        for k, v in elt1.properties.items():
+            properties[k] = v
+            if k == DocumentPropertyTypes.PAGE_NUMBER:
+                properties["page_numbers"] = properties.get("page_numbers", list())
+                properties["page_numbers"] = list(set(properties["page_numbers"] + [v]))
+        for k, v in elt2.properties.items():
+            if properties.get(k) is None:
+                properties[k] = v
+            # if a page number exists, add it to the set of page numbers for this new element
+            if k == DocumentPropertyTypes.PAGE_NUMBER:
+                properties["page_numbers"] = properties.get("page_numbers", list())
+                properties["page_numbers"] = list(set(properties["page_numbers"] + [v]))
+
+        new_elt.properties = properties
+
+        return new_elt
+
+    def customTableHeaderAdditionFilter(self, elements):
+
+        dic = defaultdict(str)
+
+        # First pass: capture headers
+        for ele in elements:
+            if ele.type in ["table", "Image", "Formula"]:
+                continue
+            elif ele.type in ["Text", "Title", "Page-header", "Section-header", "Caption"]:
+                if ele.text_representation is not None:
+                    text_rep = ele.text_representation.strip().lower()
+                if text_rep == "":
+                    continue
+                if re.search(self.regex_pattern, text_rep):
+                    dic[ele["properties"]["page_number"]] = text_rep + " "
+
+        # Second pass: update table elements with headers, done in separate loops since
+        # table headers can be within table elements as well or after them
+        for ele in elements:
+            if ele.type == "table" and isinstance(ele["table"], Table):
+                ele.text_representation = dic[ele["properties"]["page_number"]] + ele.text_representation
+                ele["properties"]["table_header"] = dic[ele["properties"]["page_number"]]
+
+        return elements
+
+    def process_llm_query(self, document):
+        # Here you can implement how to use the LLM prompt on merged elements
+        llm_query_agent = LLMTextQueryAgent(prompt=self.llm_prompt, element_type="table", llm=self.llm, table_cont=True)
+        # Assume you have a method to extract relevant information from merged elements
+        llm_results = llm_query_agent.execute_query(document)
+        return llm_results
+
+    def preprocess_element(self, elem: Element) -> Element:
+        return elem
+
+    def postprocess_element(self, elem: Element) -> Element:
+        return elem
+
+
 class Merge(SingleThreadUser, NonGPUUser, Map):
     """
     Merge Elements into fewer large elements

From f50b359aa7faa06080640b972a0173219d3bd759 Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Sun, 6 Oct 2024 20:55:51 -0700
Subject: [PATCH 02/11] Remove token_count handling and dead bbox code from TableMerger.merge

---
 .../sycamore/transforms/merge_elements.py      | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index dcb714d4d..8314c1c9c 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -477,8 +477,6 @@ def should_merge(self, element1: Element, element2: Element) -> bool:
 
     def merge(self, elt1: Element, elt2: Element) -> Element:
 
-        tok1 = elt1.data["token_count"]
-        tok2 = elt2.data["token_count"]
         new_elt = Element()
         new_elt.type = "table"
         # Merge binary representations by concatenation
@@ -489,22 +487,8 @@ def merge(self, elt1: Element, elt2: Element) -> Element:
         # Merge text representations by concatenation with a newline
         if elt1.text_representation is None or elt2.text_representation is None:
             new_elt.text_representation = elt1.text_representation or elt2.text_representation
-            new_elt.data["token_count"] = max(tok1, tok2)
         else:
             new_elt.text_representation = elt1.text_representation + "\n" + elt2.text_representation
-            new_elt.data["token_count"] = tok1 + 1 + tok2
-        # Merge bbox by taking the coords that make the largest box
-        # if elt1.bbox is None and elt2.bbox is None:
-        #     pass
-        # elif elt1.bbox is None or elt2.bbox is None:
-        #     new_elt.bbox = elt1.bbox or elt2.bbox
-        # else:
-        #     new_elt.bbox = BoundingBox(
-        #         min(elt1.bbox.x1, elt2.bbox.x1),
-        #         min(elt1.bbox.y1, elt2.bbox.y1),
-        #         max(elt1.bbox.x2, elt2.bbox.x2),
-        #         max(elt1.bbox.y2, elt2.bbox.y2),
-        #     )
         # Merge properties by taking the union of the keys
         properties = new_elt.properties
         for k, v in elt1.properties.items():
@@ -550,9 +534,7 @@ def customTableHeaderAdditionFilter(self, elements):
         return elements
 
     def process_llm_query(self, document):
-        # Here you can implement how to use the LLM prompt on merged elements
         llm_query_agent = LLMTextQueryAgent(prompt=self.llm_prompt, element_type="table", llm=self.llm, table_cont=True)
-        # Assume you have a method to extract relevant information from merged elements
         llm_results = llm_query_agent.execute_query(document)
         return llm_results
 

From a2e8f785ccf5fe0159b5d2a064c805bf6c0e773d Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Wed, 9 Oct 2024 14:23:11 -0700
Subject: [PATCH 03/11] Fix prev_table check for first table and add TableMerger type hints

---
 lib/sycamore/sycamore/transforms/llm_query.py      |  4 ++--
 lib/sycamore/sycamore/transforms/merge_elements.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/sycamore/sycamore/transforms/llm_query.py b/lib/sycamore/sycamore/transforms/llm_query.py
index 7eb32dde1..920702bdd 100644
--- a/lib/sycamore/sycamore/transforms/llm_query.py
+++ b/lib/sycamore/sycamore/transforms/llm_query.py
@@ -72,7 +72,7 @@ def execute_query(self, document: Document) -> Document:
                     if not self._table_cont:
                         document.elements[idx] = self._query_text_object(element)
                     else:
-                        if prev_table > 0:
+                        if prev_table >= 0:
                             document.elements[idx] = self._query_text_object(element, document.elements[prev_table])
                         else:
                             document.elements[idx] = self._query_text_object(element)
@@ -94,7 +94,7 @@ def execute_query(self, document: Document) -> Document:
 
     @timetrace("LLMQueryText")
     def _query_text_object(
-        self, object: Union[Document, Element], objectPrev: Element = None
+        self, object: Union[Document, Element], objectPrev: Optional[Element] = None
     ) -> Union[Document, Element]:
         if object.text_representation:
             if self._format_kwargs:
diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index 8314c1c9c..43835518c 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict
+from typing import Any, Dict, Pattern
 from collections import defaultdict
 import re
 
@@ -11,6 +11,8 @@
 from sycamore.transforms.map import Map
 from sycamore.utils.time_trace import timetrace
 from sycamore.transforms.llm_query import LLMTextQueryAgent
+from sycamore.llms import LLM
+
 
 
 class ElementMerger(ABC):
@@ -443,7 +445,7 @@ class TableMerger(ElementMerger):
                 .merge(merger=merger)
     """
 
-    def __init__(self, regex_pattern=None, llm_prompt=None, llm=None, *args, **kwargs):
+    def __init__(self, regex_pattern: Optional[Pattern] = None, llm_prompt: Optional[str] = None, llm=Optional[LLM] = None, *args, **kwargs):
         self.regex_pattern = regex_pattern
         self.llm_prompt = llm_prompt
         self.llm = llm
@@ -471,9 +473,7 @@ def merge_elements(self, document: Document) -> Document:
         return document
 
     def should_merge(self, element1: Element, element2: Element) -> bool:
-        if "true" in element2["properties"]["table_continuation"].lower():
-            return True
-        return False
+        return "true" in element2["properties"]["table_continuation"].lower()
 
     def merge(self, elt1: Element, elt2: Element) -> Element:
 

From e609221841bcbff445f818da087bdcce39a3e034 Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Wed, 9 Oct 2024 17:24:24 -0700
Subject: [PATCH 04/11] Fix TableMerger.__init__ signature syntax and import Optional

---
 lib/sycamore/sycamore/transforms/merge_elements.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index 43835518c..0a047c4ae 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Pattern
+from typing import Any, Dict, Pattern, Optional
 from collections import defaultdict
 import re
 
@@ -14,7 +14,6 @@
 from sycamore.llms import LLM
 
 
-
 class ElementMerger(ABC):
     @abstractmethod
     def should_merge(self, element1: Element, element2: Element) -> bool:
@@ -445,7 +444,14 @@ class TableMerger(ElementMerger):
                 .merge(merger=merger)
     """
 
-    def __init__(self, regex_pattern: Optional[Pattern] = None, llm_prompt: Optional[str] = None, llm=Optional[LLM] = None, *args, **kwargs):
+    def __init__(
+        self,
+        regex_pattern: Optional[Pattern] = None,
+        llm_prompt: Optional[str] = None,
+        llm: Optional[LLM] = None,
+        *args,
+        **kwargs
+    ):
         self.regex_pattern = regex_pattern
         self.llm_prompt = llm_prompt
         self.llm = llm

From 6dad1d3904e206a86c4c2a9eddce0421e6911cbc Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Wed, 9 Oct 2024 18:11:33 -0700
Subject: [PATCH 05/11] Lint

---
 lib/sycamore/sycamore/transforms/merge_elements.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index 0a047c4ae..11ce18199 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -429,10 +429,14 @@ class TableMerger(ElementMerger):
 
             llm = OpenAI(OpenAIModels.GPT_4O, api_key = '')
 
-            prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine if the second table is a continuation of the first with 100% certainty. Check either of the following:\
-            1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors because of OCR quality) in both tables. If the headers are almost the same check the number of columns, they should be roughly the same.\
-            2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically be in continutaion of the last row in the first table.\
-            Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. Certainty is determinedx if either of the two conditions is true."
+            prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine if the second table\
+                      is a continuation of the first with 100% certainty. Check either of the following:\
+            1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors because of OCR quality)\
+               in both tables. If the headers are almost the same check the number of columns, they should be roughly the same.\
+            2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically\
+               be in continutaion of the last row in the first table.\
+            Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. \
+            Certainty is determined if either of the two conditions is true."
 
             regex_pattern = r"table \d+"
 

From af172eeb4d6033300c34573526253065dbf08370 Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Sun, 13 Oct 2024 01:36:56 -0700
Subject: [PATCH 06/11] Fix till empty text_representation is released in
 sycamore

---
 lib/sycamore/sycamore/transforms/merge_elements.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index 11ce18199..1fdf81df3 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -483,7 +483,9 @@ def merge_elements(self, document: Document) -> Document:
         return document
 
     def should_merge(self, element1: Element, element2: Element) -> bool:
-        return "true" in element2["properties"]["table_continuation"].lower()
+        if "table_continuation" in element2["properties"]:
+            return "true" in element2["properties"]["table_continuation"].lower()
+        return False
 
     def merge(self, elt1: Element, elt2: Element) -> Element:
 

From 402f3f5f3237e009dd71a401f43f2231853ffc0d Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Tue, 15 Oct 2024 18:07:53 -0700
Subject: [PATCH 07/11] Merge tables into a TableElement and store matched captions in title

---
 lib/sycamore/sycamore/data/__init__.py        |  2 +-
 .../sycamore/transforms/merge_elements.py     | 67 ++++++++++++++-----
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/lib/sycamore/sycamore/data/__init__.py b/lib/sycamore/sycamore/data/__init__.py
index 737d008c3..3cafb8cdb 100644
--- a/lib/sycamore/sycamore/data/__init__.py
+++ b/lib/sycamore/sycamore/data/__init__.py
@@ -1,5 +1,5 @@
 from sycamore.data.bbox import BoundingBox
-from sycamore.data.table import Table
+from sycamore.data.table import Table, TableCell
 from sycamore.data.element import Element, ImageElement, TableElement
 from sycamore.data.document import (
     Document,
diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index 1fdf81df3..72507c0a4 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -4,7 +4,7 @@
 import re
 
 
-from sycamore.data import Document, Element, BoundingBox, Table
+from sycamore.data import Document, Element, BoundingBox, Table, TableElement, TableCell
 from sycamore.data.document import DocumentPropertyTypes
 from sycamore.plan_nodes import SingleThreadUser, NonGPUUser, Node
 from sycamore.functions.tokenizer import Tokenizer
@@ -429,12 +429,13 @@ class TableMerger(ElementMerger):
 
             llm = OpenAI(OpenAIModels.GPT_4O, api_key = '')
 
-            prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine if the second table\
-                      is a continuation of the first with 100% certainty. Check either of the following:\
-            1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors because of OCR quality)\
-               in both tables. If the headers are almost the same check the number of columns, they should be roughly the same.\
-            2. Missing headers: If the header/columns in the second table are missing, then the first row in the second table should logically\
-               be in continutaion of the last row in the first table.\
+            prompt = "Analyze two CSV tables that may be parts of a single table split across pages. Determine\
+            if the second table is a continuation of the first with 100% certainty. Check either of the following:\
+            1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors \
+            because of OCR quality) in both tables. If the headers are almost the same check the number of columns,\
+                 they should be roughly the same.\
+            2. Missing headers: If the header/columns in the second table are missing, then the first row in the
+            second table should logically be in continutaion of the last row in the first table.\
             Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. \
             Certainty is determined if either of the two conditions is true."
 
@@ -454,7 +455,7 @@ def __init__(
         llm_prompt: Optional[str] = None,
         llm: Optional[LLM] = None,
         *args,
-        **kwargs
+        **kwargs,
     ):
         self.regex_pattern = regex_pattern
         self.llm_prompt = llm_prompt
@@ -482,15 +483,42 @@ def merge_elements(self, document: Document) -> Document:
         document.elements = other_elements
         return document
 
-    def should_merge(self, element1: Element, element2: Element) -> bool:
+    def should_merge(self, element1: TableElement, element2: TableElement) -> bool:
         if "table_continuation" in element2["properties"]:
             return "true" in element2["properties"]["table_continuation"].lower()
         return False
 
-    def merge(self, elt1: Element, elt2: Element) -> Element:
+    def merge(self, elt1: TableElement, elt2: TableElement) -> TableElement:
+
+        # Combine the cells, adjusting the row indices for the second table
+        offset_row = elt1.table.num_rows
+        merged_cells = elt1.table.cells + [
+            TableCell(
+                content=cell.content,
+                rows=[r + offset_row for r in cell.rows],
+                cols=cell.cols,
+                is_header=cell.is_header,
+                bbox=cell.bbox,
+                properties=cell.properties,
+            )
+            for cell in elt2.table.cells
+        ]
+
+        # Create a new Table object with merged cells
+        merged_table = Table(cells=merged_cells)
+
+        title1 = elt1.data["properties"].get("title", "") or ""
+        title2 = elt2.data["properties"].get("title", "") or ""
+        merged_title = f"{title1} / {title2}".strip(" / ")
+        # Create a new TableElement with the merged table and combined metadata
+        new_elt = TableElement(
+            title=merged_title if merged_title else None,
+            columns=elt1.columns if elt1.columns else elt2.columns,
+            rows=elt1.rows + elt2.rows if elt1.rows and elt2.rows else None,
+            table=merged_table,
+            tokens=elt1.tokens + elt2.tokens if elt1.tokens and elt2.tokens else None,
+        )
 
-        new_elt = Element()
-        new_elt.type = "table"
         # Merge binary representations by concatenation
         if elt1.binary_representation is None or elt2.binary_representation is None:
             new_elt.binary_representation = elt1.binary_representation or elt2.binary_representation
@@ -530,7 +558,7 @@ def customTableHeaderAdditionFilter(self, elements):
                 continue
             elif ele.type in ["Text", "Title", "Page-header", "Section-header", "Caption"]:
                 if ele.text_representation is not None:
-                    text_rep = ele.text_representation.strip().lower()
+                    text_rep = ele.text_representation.strip()
                 if text_rep == "":
                     continue
                 if re.search(self.regex_pattern, text_rep):
@@ -541,19 +569,24 @@ def customTableHeaderAdditionFilter(self, elements):
         for ele in elements:
             if ele.type == "table" and isinstance(ele["table"], Table):
                 ele.text_representation = dic[ele["properties"]["page_number"]] + ele.text_representation
-                ele["properties"]["table_header"] = dic[ele["properties"]["page_number"]]
-
+                if ele["properties"]["title"]:
+                    ele["properties"]["title"] = (
+                        ele["properties"]["title"] + "\n" + dic[ele["properties"]["page_number"]]
+                    )
+                else:
+                    ele["properties"]["title"] = dic[ele["properties"]["page_number"]]
         return elements
 
     def process_llm_query(self, document):
+        # TO-DO: Add async llm query
         llm_query_agent = LLMTextQueryAgent(prompt=self.llm_prompt, element_type="table", llm=self.llm, table_cont=True)
         llm_results = llm_query_agent.execute_query(document)
         return llm_results
 
-    def preprocess_element(self, elem: Element) -> Element:
+    def preprocess_element(self, elem: TableElement) -> TableElement:
         return elem
 
-    def postprocess_element(self, elem: Element) -> Element:
+    def postprocess_element(self, elem: TableElement) -> TableElement:
         return elem
 
 

From 2a6c22f946faf837b0ee81c6f5bf659e47c123dc Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Tue, 15 Oct 2024 18:14:02 -0700
Subject: [PATCH 08/11] Export TableCell and bbox-sort document after table merge

---
 lib/sycamore/sycamore/data/__init__.py             | 1 +
 lib/sycamore/sycamore/transforms/merge_elements.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/lib/sycamore/sycamore/data/__init__.py b/lib/sycamore/sycamore/data/__init__.py
index 3cafb8cdb..b02918da8 100644
--- a/lib/sycamore/sycamore/data/__init__.py
+++ b/lib/sycamore/sycamore/data/__init__.py
@@ -21,4 +21,5 @@
     "OpenSearchQuery",
     "OpenSearchQueryResult",
     "Table",
+    "TableCell",
 ]
diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index 72507c0a4..91bb295b6 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -12,6 +12,7 @@
 from sycamore.utils.time_trace import timetrace
 from sycamore.transforms.llm_query import LLMTextQueryAgent
 from sycamore.llms import LLM
+from sycamore.utils.bbox_sort import bbox_sort_document
 
 
 class ElementMerger(ABC):
@@ -481,6 +482,8 @@ def merge_elements(self, document: Document) -> Document:
                 new_table_elements.append(element)
         other_elements.extend(new_table_elements)
         document.elements = other_elements
+        bbox_sort_document(document)
+
         return document
 
     def should_merge(self, element1: TableElement, element2: TableElement) -> bool:

From 89f6956ff9920b13207f79ba869e98f76f48b74c Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Tue, 15 Oct 2024 18:29:02 -0700
Subject: [PATCH 09/11] bbox & parent class fix

---
 .../sycamore/transforms/merge_elements.py     | 22 ++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index 91bb295b6..30e6180e8 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -434,7 +434,7 @@ class TableMerger(ElementMerger):
             if the second table is a continuation of the first with 100% certainty. Check either of the following:\
             1. Column headers: Must be near identical in terms of text(the ordering/text may contain minor errors \
             because of OCR quality) in both tables. If the headers are almost the same check the number of columns,\
-                 they should be roughly the same.\
+                 they should be roughly the same. \
             2. Missing headers: If the header/columns in the second table are missing, then the first row in the
             second table should logically be in continutaion of the last row in the first table.\
             Respond with only 'true' or 'false' based on your certainty that the second table is a continuation. \
@@ -486,12 +486,12 @@ def merge_elements(self, document: Document) -> Document:
 
         return document
 
-    def should_merge(self, element1: TableElement, element2: TableElement) -> bool:
+    def should_merge(self, element1: Element, element2: Element) -> bool:
         if "table_continuation" in element2["properties"]:
             return "true" in element2["properties"]["table_continuation"].lower()
         return False
 
-    def merge(self, elt1: TableElement, elt2: TableElement) -> TableElement:
+    def merge(self, elt1: Element, elt2: Element) -> Element:
 
         # Combine the cells, adjusting the row indices for the second table
         offset_row = elt1.table.num_rows
@@ -547,6 +547,18 @@ def merge(self, elt1: TableElement, elt2: TableElement) -> TableElement:
                 properties["page_numbers"] = properties.get("page_numbers", list())
                 properties["page_numbers"] = list(set(properties["page_numbers"] + [v]))
 
+        # TO-DO: The merged element currently keeps only the first table's bbox; the bboxes
+        # of the continuation tables are discarded. A possible fix is to store a list of
+        # bboxes and label every bbox after the first as "table_continuation".
+        if elt1.bbox is None or elt2.bbox is None:
+            new_elt.bbox = elt1.bbox or elt2.bbox
+        else:
+            new_elt.bbox = BoundingBox(
+                elt1.bbox.x1,
+                elt1.bbox.y1,
+                elt1.bbox.x2,
+                elt1.bbox.y2,
+            )
         new_elt.properties = properties
 
         return new_elt
@@ -586,10 +598,10 @@ def process_llm_query(self, document):
         llm_results = llm_query_agent.execute_query(document)
         return llm_results
 
-    def preprocess_element(self, elem: TableElement) -> TableElement:
+    def preprocess_element(self, elem: Element) -> Element:
         return elem
 
-    def postprocess_element(self, elem: TableElement) -> TableElement:
+    def postprocess_element(self, elem: Element) -> Element:
         return elem
 
 

From ac8afbf49a75234c52367aca80198ef9c06a74e9 Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Tue, 15 Oct 2024 18:44:58 -0700
Subject: [PATCH 10/11] mypy

---
 lib/sycamore/sycamore/transforms/merge_elements.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index 30e6180e8..a70efd4bb 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -493,6 +493,9 @@ def should_merge(self, element1: Element, element2: Element) -> bool:
 
     def merge(self, elt1: Element, elt2: Element) -> Element:
 
+        # Check if both elements are TableElements
+        if not isinstance(elt1, TableElement) or not isinstance(elt2, TableElement):
+            raise TypeError("Both elements must be of type TableElement to perform merging.")
         # Combine the cells, adjusting the row indices for the second table
         offset_row = elt1.table.num_rows
         merged_cells = elt1.table.cells + [

From e8e8e2288c76fd5c1edcbd5dcf4ebc318db1f5f9 Mon Sep 17 00:00:00 2001
From: Dhruv Kaliraman <dhruvkal@aryn.ai>
Date: Tue, 15 Oct 2024 18:56:44 -0700
Subject: [PATCH 11/11] mypy

---
 lib/sycamore/sycamore/transforms/merge_elements.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/sycamore/sycamore/transforms/merge_elements.py b/lib/sycamore/sycamore/transforms/merge_elements.py
index a70efd4bb..1ed1e1ef0 100644
--- a/lib/sycamore/sycamore/transforms/merge_elements.py
+++ b/lib/sycamore/sycamore/transforms/merge_elements.py
@@ -497,6 +497,9 @@ def merge(self, elt1: Element, elt2: Element) -> Element:
         if not isinstance(elt1, TableElement) or not isinstance(elt2, TableElement):
             raise TypeError("Both elements must be of type TableElement to perform merging.")
         # Combine the cells, adjusting the row indices for the second table
+        if elt1.table is None or elt2.table is None:
+            raise ValueError("Both elements must have a table to perform merging.")
+
         offset_row = elt1.table.num_rows
         merged_cells = elt1.table.cells + [
             TableCell(