Skip to content

Subdoc summary

SubDocSummaryPack #

Bases: BaseLlamaPack

Pack for injecting sub-doc metadata into each chunk.

Source code in llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
class SubDocSummaryPack(BaseLlamaPack):
    """Pack for injecting sub-doc metadata into each chunk.

    Documents are first split into large "parent" chunks. Each parent chunk
    is summarized with a ``tree_summarize`` query engine, then split again
    into smaller "child" chunks. The parent's summary is stored on every
    child under the ``"context_summary"`` metadata key, and a vector index
    is built over all child chunks.
    """

    def __init__(
        self,
        documents: List[Document],
        parent_chunk_size: int = 8192,
        parent_chunk_overlap: int = 512,
        child_chunk_size: int = 512,
        child_chunk_overlap: int = 32,
        summary_prompt_str: str = DEFAULT_SUMMARY_PROMPT_STR,
        verbose: bool = False,
        embed_model: Optional[BaseEmbedding] = None,
        llm: Optional[LLM] = None,
    ) -> None:
        """Split, summarize, and index the given documents.

        Args:
            documents: Documents to split into parent and child chunks.
            parent_chunk_size: ``chunk_size`` for the parent splitter.
            parent_chunk_overlap: ``chunk_overlap`` for the parent splitter.
            child_chunk_size: ``chunk_size`` for the child splitter.
            child_chunk_overlap: ``chunk_overlap`` for the child splitter.
            summary_prompt_str: Query sent to the summary query engine to
                produce each parent chunk's summary.
            verbose: If True, print progress and each extracted summary.
            embed_model: Embedding model passed to the vector index.
            llm: LLM passed to both the per-parent summary query engine and
                the final vector query engine.
        """
        self.parent_chunk_size = parent_chunk_size
        self.child_chunk_size = child_chunk_size
        self.verbose = verbose

        self.parent_splitter = SentenceSplitter(
            chunk_size=parent_chunk_size, chunk_overlap=parent_chunk_overlap
        )
        self.child_splitter = SentenceSplitter(
            chunk_size=child_chunk_size, chunk_overlap=child_chunk_overlap
        )

        self.summary_prompt_str = summary_prompt_str
        self.embed_model = embed_model
        self.llm = llm

        parent_nodes = self.parent_splitter.get_nodes_from_documents(documents)
        all_child_nodes = []
        # For each parent chunk: summarize it, split it into child chunks,
        # and tag every child with the parent's summary.
        for idx, parent_node in enumerate(parent_nodes):
            if verbose:
                print_text(
                    f"> Processing parent chunk {idx + 1} of {len(parent_nodes)}\n",
                    color="blue",
                )
            # Summarize the parent chunk. Pass the configured LLM so that
            # summarization uses the same model as the final query engine.
            summary_index = SummaryIndex([parent_node])
            summary_query_engine = summary_index.as_query_engine(
                llm=self.llm, response_mode="tree_summarize"
            )
            # Use the caller-supplied prompt; it defaults to
            # DEFAULT_SUMMARY_PROMPT_STR when not overridden.
            parent_summary = summary_query_engine.query(self.summary_prompt_str)
            if verbose:
                print_text(f"Extracted summary: {parent_summary}\n", color="pink")

            # Attach the summary to all child nodes of this parent.
            child_nodes = self.child_splitter.get_nodes_from_documents([parent_node])
            summary_text = str(parent_summary)
            for child_node in child_nodes:
                child_node.metadata["context_summary"] = summary_text

            all_child_nodes.extend(child_nodes)

        # Build the vector index over all child nodes.
        self.vector_index = VectorStoreIndex(
            all_child_nodes, embed_model=self.embed_model
        )
        self.vector_retriever = self.vector_index.as_retriever()
        self.vector_query_engine = self.vector_index.as_query_engine(llm=llm)

    def get_modules(self) -> Dict[str, Any]:
        """Return the index, retriever, and query engine keyed by name."""
        return {
            "vector_index": self.vector_index,
            "vector_retriever": self.vector_retriever,
            "vector_query_engine": self.vector_query_engine,
        }

    def run(self, *args: Any, **kwargs: Any) -> Any:
        """Forward all arguments to the vector query engine's ``query``."""
        return self.vector_query_engine.query(*args, **kwargs)

get_modules #

get_modules() -> Dict[str, Any]

Return the pack's modules as a dictionary: the vector index, its retriever, and its query engine.

Source code in llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py
83
84
85
86
87
88
89
def get_modules(self) -> Dict[str, Any]:
    """Return the vector index, retriever, and query engine keyed by name."""
    modules: Dict[str, Any] = {}
    # Insert in a fixed order so the mapping's key order stays stable.
    modules["vector_index"] = self.vector_index
    modules["vector_retriever"] = self.vector_retriever
    modules["vector_query_engine"] = self.vector_query_engine
    return modules

run #

run(*args: Any, **kwargs: Any) -> Any

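Run the pipeline by forwarding all positional and keyword arguments to the vector query engine's `query` method and returning its result.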

Source code in llama-index-packs/llama-index-packs-subdoc-summary/llama_index/packs/subdoc_summary/base.py
91
92
93
def run(self, *args: Any, **kwargs: Any) -> Any:
    """Query the vector query engine with the given arguments."""
    engine = self.vector_query_engine
    # Arguments are passed through untouched; the engine's result is returned as-is.
    response = engine.query(*args, **kwargs)
    return response