Skip to content

Marvin

MarvinMetadataExtractor #

Bases: BaseExtractor

Source code in llama-index-integrations/extractors/llama-index-extractors-marvin/llama_index/extractors/marvin/base.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
class MarvinMetadataExtractor(BaseExtractor):
    """Metadata extractor for custom metadata using Marvin.

    Node-level extractor. Extracts the `marvin_metadata` metadata field.

    Args:
        marvin_model: Marvin model to use for extracting metadata
        llm_model_string: (optional) LLM model string to use for extracting metadata

    Usage:
        #create extractor list
        extractors = [
            TitleExtractor(nodes=1, llm=llm),
            MarvinMetadataExtractor(marvin_model=YourMarvinMetadataModel),
        ]

        #create node parser to parse nodes from document
        node_parser = SentenceSplitter(
            text_splitter=text_splitter
        )

        #use node_parser to get nodes from documents
        from llama_index.ingestion import run_transformations
        nodes = run_transformations(documents, [node_parser] + extractors)
        print(nodes)
    """

    # String forward reference ("ai_model") avoids importing marvin at module
    # load time; marvin is only imported lazily inside methods.
    marvin_model: Type["ai_model"] = Field(
        description="The Marvin model to use for extracting custom metadata"
    )
    llm_model_string: Optional[str] = Field(
        description="The LLM model string to use for extracting custom metadata"
    )

    def __init__(
        self,
        marvin_model: Type[BaseModel],
        llm_model_string: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Raises:
            ValueError: If `marvin_model` is not a Marvin `ai_model` subclass.
        """
        import marvin
        from marvin import ai_model

        if not issubclass(marvin_model, ai_model):
            raise ValueError("marvin_model must be a subclass of ai_model")

        # NOTE(review): this mutates marvin's process-wide settings; the model
        # choice is NOT scoped to this extractor instance.
        if llm_model_string:
            marvin.settings.llm_model = llm_model_string

        super().__init__(
            marvin_model=marvin_model, llm_model_string=llm_model_string, **kwargs
        )

    @classmethod
    def class_name(cls) -> str:
        # NOTE(review): name differs from the class ("MarvinMetadataExtractor");
        # kept as-is so previously persisted/serialized components still resolve.
        return "MarvinEntityExtractor"

    async def aextract(self, nodes: Sequence[BaseNode]) -> List[Dict]:
        """Extract `marvin_metadata` for each node.

        When `is_text_node_only` is set, non-text nodes yield an empty dict so
        the returned list stays index-aligned with `nodes`.
        """
        from marvin import ai_model

        # Bind to a distinct local name instead of shadowing the imported
        # `ai_model` symbol with the cast result.
        model = cast(ai_model, self.marvin_model)
        metadata_list: List[Dict] = []

        nodes_queue: Iterable[BaseNode] = get_tqdm_iterable(
            nodes, self.show_progress, "Extracting marvin metadata"
        )
        for node in nodes_queue:
            if self.is_text_node_only and not isinstance(node, TextNode):
                metadata_list.append({})
                continue

            # TODO: Does marvin support async? This call is synchronous and
            # blocks the event loop once per node.
            metadata = model(node.get_content())

            # NOTE(review): `.dict()` is pydantic-v1 API (`model_dump()` in
            # v2); confirm against the pydantic version marvin pins.
            metadata_list.append({"marvin_metadata": metadata.dict()})
        return metadata_list

llm_model_string (class attribute / instance attribute) #

llm_model_string: Optional[str] = Field(description='The LLM model string to use for extracting custom metadata')

Metadata extractor for custom metadata using Marvin. Node-level extractor. Extracts the `marvin_metadata` metadata field.

Args:
    marvin_model: Marvin model to use for extracting metadata
    llm_model_string: (optional) LLM model string to use for extracting metadata

Usage:

    #create extractor list
    extractors = [
        TitleExtractor(nodes=1, llm=llm),
        MarvinMetadataExtractor(marvin_model=YourMarvinMetadataModel),
    ]

#create node parser to parse nodes from document
node_parser = SentenceSplitter(
    text_splitter=text_splitter
)

#use node_parser to get nodes from documents
from llama_index.ingestion import run_transformations
nodes = run_transformations(documents, [node_parser] + extractors)
print(nodes)