
Unstructured element

Node parsers.

UnstructuredElementNodeParser #

Bases: BaseElementNodeParser

Unstructured element node parser.

Splits a document into Text Nodes and Index Nodes, where the Index Nodes correspond to embedded objects (e.g. tables).
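
A minimal usage sketch (assuming `llama-index-core`, `unstructured`, and `lxml` are installed, and that an LLM is configured for table summaries; the HTML string below is invented for illustration):

from llama_index.core import Document
from llama_index.core.node_parser import UnstructuredElementNodeParser

html = (
    "<h1>Quarterly results</h1>"
    "<table><tr><td>quarter</td><td>revenue</td></tr>"
    "<tr><td>Q1</td><td>100</td></tr><tr><td>Q2</td><td>120</td></tr></table>"
)

node_parser = UnstructuredElementNodeParser(
    # extra kwargs forwarded to unstructured's partition_html; empty by default
    partitioning_parameters={},
)
# Table summarization calls the configured LLM, so one must be available
# (e.g. Settings.llm or the `llm` constructor argument) when tables are kept.
nodes = node_parser.get_nodes_from_documents([Document(text=html)])

The output mixes Text Nodes for prose with Index Nodes for tables that pass filter_table.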

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
class UnstructuredElementNodeParser(BaseElementNodeParser):
    """Unstructured element node parser.

    Splits a document into Text Nodes and Index Nodes corresponding to embedded objects
    (e.g. tables).

    """

    partitioning_parameters: Optional[Dict[str, Any]] = Field(
        default={},
        description="Extra dictionary representing parameters of the partitioning process.",
    )

    def __init__(
        self,
        callback_manager: Optional[CallbackManager] = None,
        llm: Optional[Any] = None,
        summary_query_str: str = DEFAULT_SUMMARY_QUERY_STR,
        partitioning_parameters: Optional[Dict[str, Any]] = {},
    ) -> None:
        """Initialize."""
        try:
            import lxml  # noqa  # pants: no-infer-dep
            import unstructured  # noqa  # pants: no-infer-dep
        except ImportError:
            raise ImportError(
                "You must install the `unstructured` and `lxml` "
                "package to use this node parser."
            )
        callback_manager = callback_manager or CallbackManager([])

        return super().__init__(
            callback_manager=callback_manager,
            llm=llm,
            summary_query_str=summary_query_str,
            partitioning_parameters=partitioning_parameters,
        )

    @classmethod
    def class_name(cls) -> str:
        return "UnstructuredElementNodeParser"

    def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
        """Get nodes from node."""
        elements = self.extract_elements(
            node.get_content(), table_filters=[self.filter_table]
        )
        table_elements = self.get_table_elements(elements)
        # extract summaries over table elements
        self.extract_table_summaries(table_elements)
        # convert into nodes
        # will return a list of Nodes and Index Nodes
        nodes = self.get_nodes_from_elements(elements, node.metadata)

        source_document = node.source_node or node.as_related_node_info()
        for n in nodes:
            n.relationships[NodeRelationship.SOURCE] = source_document
            n.metadata.update(node.metadata)
        return nodes

    def extract_elements(
        self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
    ) -> List[Element]:
        """Extract elements from text."""
        from unstructured.partition.html import partition_html  # pants: no-infer-dep

        table_filters = table_filters or []
        elements = partition_html(text=text, **self.partitioning_parameters)
        output_els = []
        for idx, element in enumerate(elements):
            if "unstructured.documents.html.HTMLTable" in str(type(element)):
                should_keep = all(tf(element) for tf in table_filters)
                if should_keep:
                    table_df = html_to_df(str(element.metadata.text_as_html))
                    output_els.append(
                        Element(
                            id=f"id_{idx}",
                            type="table",
                            element=element,
                            table=table_df,
                        )
                    )
                else:
                    # table did not pass the filters; keep it as text so we don't lose context
                    from unstructured.documents.html import HTMLText

                    newElement = HTMLText(str(element), tag=element.tag)
                    output_els.append(
                        Element(id=f"id_{idx}", type="text", element=newElement)
                    )
            else:
                output_els.append(Element(id=f"id_{idx}", type="text", element=element))
        return output_els

    def filter_table(self, table_element: Any) -> bool:
        """Filter tables."""
        table_df = html_to_df(table_element.metadata.text_as_html)

        # keep only tables that parsed into a non-empty DataFrame with more than one column
        return table_df is not None and not table_df.empty and len(table_df.columns) > 1

get_nodes_from_node #

get_nodes_from_node(node: TextNode) -> List[BaseNode]

Get nodes from node.
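
A minimal sketch of calling this directly on a TextNode whose text is HTML (the node content and metadata are invented for illustration; table summaries require a configured LLM):

from llama_index.core.node_parser import UnstructuredElementNodeParser
from llama_index.core.schema import TextNode

parser = UnstructuredElementNodeParser()
html_node = TextNode(
    text=(
        "<p>Quarterly revenue.</p>"
        "<table><tr><td>quarter</td><td>revenue</td></tr>"
        "<tr><td>Q1</td><td>100</td></tr></table>"
    ),
    metadata={"source": "report.html"},
)
nodes = parser.get_nodes_from_node(html_node)
for n in nodes:
    # every output node points back to the source node and inherits its metadata
    print(type(n).__name__, n.metadata.get("source"))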

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
def get_nodes_from_node(self, node: TextNode) -> List[BaseNode]:
    """Get nodes from node."""
    elements = self.extract_elements(
        node.get_content(), table_filters=[self.filter_table]
    )
    table_elements = self.get_table_elements(elements)
    # extract summaries over table elements
    self.extract_table_summaries(table_elements)
    # convert into nodes
    # will return a list of Nodes and Index Nodes
    nodes = self.get_nodes_from_elements(elements, node.metadata)

    source_document = node.source_node or node.as_related_node_info()
    for n in nodes:
        n.relationships[NodeRelationship.SOURCE] = source_document
        n.metadata.update(node.metadata)
    return nodes

extract_elements #

extract_elements(text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any) -> List[Element]

Extract elements from text.
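
A minimal sketch (HTML string invented for illustration). No LLM is needed for this step, but `unstructured` and `lxml` must be installed; whether a given table is recognized as a table element can depend on the installed unstructured version:

from llama_index.core.node_parser import UnstructuredElementNodeParser

parser = UnstructuredElementNodeParser()
html = (
    "<p>Intro paragraph.</p>"
    "<table><tr><td>a</td><td>b</td></tr><tr><td>1</td><td>2</td></tr></table>"
)
elements = parser.extract_elements(html, table_filters=[parser.filter_table])
for el in elements:
    # el.type is "table" or "text"; table elements also carry a pandas DataFrame in el.table
    print(el.id, el.type)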

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
def extract_elements(
    self, text: str, table_filters: Optional[List[Callable]] = None, **kwargs: Any
) -> List[Element]:
    """Extract elements from text."""
    from unstructured.partition.html import partition_html  # pants: no-infer-dep

    table_filters = table_filters or []
    elements = partition_html(text=text, **self.partitioning_parameters)
    output_els = []
    for idx, element in enumerate(elements):
        if "unstructured.documents.html.HTMLTable" in str(type(element)):
            should_keep = all(tf(element) for tf in table_filters)
            if should_keep:
                table_df = html_to_df(str(element.metadata.text_as_html))
                output_els.append(
                    Element(
                        id=f"id_{idx}",
                        type="table",
                        element=element,
                        table=table_df,
                    )
                )
            else:
                # table did not pass the filters; keep it as text so we don't lose context
                from unstructured.documents.html import HTMLText

                newElement = HTMLText(str(element), tag=element.tag)
                output_els.append(
                    Element(id=f"id_{idx}", type="text", element=newElement)
                )
        else:
            output_els.append(Element(id=f"id_{idx}", type="text", element=element))
    return output_els

filter_table #

filter_table(table_element: Any) -> bool

Filter tables.
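
A table is kept only if it parses into a non-empty DataFrame with more than one column. A minimal sketch using stand-in objects (filter_table only reads element.metadata.text_as_html, so a SimpleNamespace is enough for illustration):

from types import SimpleNamespace

from llama_index.core.node_parser import UnstructuredElementNodeParser

parser = UnstructuredElementNodeParser()

# stand-ins for unstructured table elements; only metadata.text_as_html is read
single_cell = SimpleNamespace(
    metadata=SimpleNamespace(text_as_html="<table><tr><td>only cell</td></tr></table>")
)
two_by_two = SimpleNamespace(
    metadata=SimpleNamespace(
        text_as_html=(
            "<table><tr><td>quarter</td><td>revenue</td></tr>"
            "<tr><td>Q1</td><td>100</td></tr></table>"
        )
    )
)

print(parser.filter_table(single_cell))  # False: no usable tabular structure
print(parser.filter_table(two_by_two))   # True: parses into a table with more than one column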

Source code in llama-index-core/llama_index/core/node_parser/relational/unstructured_element.py
def filter_table(self, table_element: Any) -> bool:
    """Filter tables."""
    table_df = html_to_df(table_element.metadata.text_as_html)

    # keep only tables that parsed into a non-empty DataFrame with more than one column
    return table_df is not None and not table_df.empty and len(table_df.columns) > 1