
SingleStoreDB

SingleStoreVectorStore #

Bases: VectorStore

SingleStore vector store.

This vector store stores embeddings within a SingleStore database table.

During query time, the index uses SingleStore to query for the top k most similar nodes.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `table_name` | `str` | Specifies the name of the table in use. | `'embeddings'` |
| `content_field` | `str` | Specifies the field used to store the content. | `'content'` |
| `metadata_field` | `str` | Specifies the field used to store metadata. | `'metadata'` |
| `vector_field` | `str` | Specifies the field used to store the vector. | `'vector'` |

The following arguments configure the connection pool:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pool_size` | `int` | Determines the number of active connections in the pool. | `5` |
| `max_overflow` | `int` | Determines the maximum number of connections allowed beyond `pool_size`. | `10` |
| `timeout` | `float` | Specifies the maximum wait time in seconds for establishing a connection. | `30` |

The following arguments configure the underlying connection and are passed through as keyword arguments:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `host` | `str` | Specifies the hostname, IP address, or URL for the database connection. The default scheme is `"mysql"`. | required |
| `user` | `str` | Database username. | required |
| `password` | `str` | Database password. | required |
| `port` | `int` | Database port. Defaults to 3306 for non-HTTP connections, 80 for HTTP connections, and 443 for HTTPS connections. | required |
| `database` | `str` | Database name. | required |

Examples:

`pip install llama-index-vector-stores-singlestoredb`

```python
from llama_index.vector_stores.singlestoredb import SingleStoreVectorStore
import os

# can set the singlestore db url in env
# or pass it in as an argument to the SingleStoreVectorStore constructor
os.environ["SINGLESTOREDB_URL"] = "PLACEHOLDER URL"
vector_store = SingleStoreVectorStore(
    table_name="embeddings",
    content_field="content",
    metadata_field="metadata",
    vector_field="vector",
    timeout=30,
)
```
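Once constructed, the store drops into the usual LlamaIndex workflow. A minimal sketch (assuming `llama-index` core is installed, a default embedding model is configured, and `documents` is a list of already-loaded `Document` objects):

```python
from llama_index.core import StorageContext, VectorStoreIndex

# Route storage through the SingleStore-backed vector store; embeddings are
# written to the configured table via SingleStoreVectorStore.add().
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

# Retrieval runs through SingleStoreVectorStore.query() under the hood.
query_engine = index.as_query_engine(similarity_top_k=3)
print(query_engine.query("What do these documents cover?"))
```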
Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-singlestoredb/llama_index/vector_stores/singlestoredb/base.py
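The excerpt below begins at line 22 of `base.py`, so the module header is not shown. A plausible reconstruction of the imports it relies on (an assumption inferred from the names used in the class, not verified source) is:

```python
# Assumed module header for base.py -- inferred from usage, not copied from the repo.
import json
import logging
from typing import Any, List, Optional, Sequence

import singlestoredb as s2
from sqlalchemy.pool import QueuePool

from llama_index.core.schema import BaseNode, MetadataMode
from llama_index.core.vector_stores.types import (
    VectorStore,
    VectorStoreQuery,
    VectorStoreQueryResult,
)
from llama_index.core.vector_stores.utils import (
    metadata_dict_to_node,
    node_to_metadata_dict,
)

logger = logging.getLogger(__name__)
```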
class SingleStoreVectorStore(VectorStore):
    """SingleStore vector store.

    This vector store stores embeddings within a SingleStore database table.

    During query time, the index uses SingleStore to query for the top
    k most similar nodes.

    Args:
        table_name (str, optional): Specifies the name of the table in use.
                Defaults to "embeddings".
        content_field (str, optional): Specifies the field to store the content.
            Defaults to "content".
        metadata_field (str, optional): Specifies the field to store metadata.
            Defaults to "metadata".
        vector_field (str, optional): Specifies the field to store the vector.
            Defaults to "vector".

        Following arguments pertain to the connection pool:

        pool_size (int, optional): Determines the number of active connections in
            the pool. Defaults to 5.
        max_overflow (int, optional): Determines the maximum number of connections
            allowed beyond the pool_size. Defaults to 10.
        timeout (float, optional): Specifies the maximum wait time in seconds for
            establishing a connection. Defaults to 30.

        Following arguments pertain to the connection:

        host (str, optional): Specifies the hostname, IP address, or URL for the
                database connection. The default scheme is "mysql".
        user (str, optional): Database username.
        password (str, optional): Database password.
        port (int, optional): Database port. Defaults to 3306 for non-HTTP
            connections, 80 for HTTP connections, and 443 for HTTPS connections.
        database (str, optional): Database name.

    Examples:
        `pip install llama-index-vector-stores-singlestoredb`

        ```python
        from llama_index.vector_stores.singlestoredb import SingleStoreVectorStore
        import os

        # can set the singlestore db url in env
        # or pass it in as an argument to the SingleStoreVectorStore constructor
        os.environ["SINGLESTOREDB_URL"] = "PLACEHOLDER URL"
        vector_store = SingleStoreVectorStore(
            table_name="embeddings",
            content_field="content",
            metadata_field="metadata",
            vector_field="vector",
            timeout=30,
        )
        ```

    """

    stores_text: bool = True
    flat_metadata: bool = True

    def __init__(
        self,
        table_name: str = "embeddings",
        content_field: str = "content",
        metadata_field: str = "metadata",
        vector_field: str = "vector",
        pool_size: int = 5,
        max_overflow: int = 10,
        timeout: float = 30,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        self.table_name = table_name
        self.content_field = content_field
        self.metadata_field = metadata_field
        self.vector_field = vector_field
        self.pool_size = pool_size
        self.max_overflow = max_overflow
        self.timeout = timeout

        self.connection_kwargs = kwargs
        self.connection_pool = QueuePool(
            self._get_connection,
            pool_size=self.pool_size,
            max_overflow=self.max_overflow,
            timeout=self.timeout,
        )

        self._create_table()

    @property
    def client(self) -> Any:
        """Return SingleStoreDB client."""
        return self._get_connection()

    @classmethod
    def class_name(cls) -> str:
        return "SingleStoreVectorStore"

    def _get_connection(self) -> Any:
        return s2.connect(**self.connection_kwargs)

    def _create_table(self) -> None:
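        # Vectors are stored in a BLOB column and written with JSON_ARRAY_PACK;
        # query() later computes DOT_PRODUCT directly against the packed blob.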
        conn = self.connection_pool.connect()
        try:
            cur = conn.cursor()
            try:
                cur.execute(
                    f"""CREATE TABLE IF NOT EXISTS {self.table_name}
                    ({self.content_field} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
                    {self.vector_field} BLOB, {self.metadata_field} JSON);"""
                )
            finally:
                cur.close()
        finally:
            conn.close()

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        """
        conn = self.connection_pool.connect()
        cursor = conn.cursor()
        try:
            for node in nodes:
                embedding = node.get_embedding()
                metadata = node_to_metadata_dict(
                    node, remove_text=True, flat_metadata=self.flat_metadata
                )
                cursor.execute(
                    "INSERT INTO {} VALUES (%s, JSON_ARRAY_PACK(%s), %s)".format(
                        self.table_name
                    ),
                    (
                        node.get_content(metadata_mode=MetadataMode.NONE) or "",
                        "[{}]".format(",".join(map(str, embedding))),
                        json.dumps(metadata),
                    ),
                )
        finally:
            cursor.close()
            conn.close()
        return [node.node_id for node in nodes]

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes with the given ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        conn = self.connection_pool.connect()
        cursor = conn.cursor()
        try:
            cursor.execute(
                f"DELETE FROM {self.table_name} WHERE JSON_EXTRACT_JSON(metadata, 'ref_doc_id') = %s",
                ('"' + ref_doc_id + '"',),
            )
        finally:
            cursor.close()
            conn.close()

    def query(
        self, query: VectorStoreQuery, filter: Optional[dict] = None, **kwargs: Any
    ) -> VectorStoreQueryResult:
        """
        Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): Contains query_embedding and similarity_top_k attributes.
            filter (Optional[dict]): A dictionary of metadata fields and values to filter by. Defaults to None.

        Returns:
            VectorStoreQueryResult: Contains nodes, similarities, and ids attributes.
        """
        query_embedding = query.query_embedding
        similarity_top_k = query.similarity_top_k
        conn = self.connection_pool.connect()
        where_clause: str = ""
        where_clause_values: List[Any] = []

        if filter:
            where_clause = "WHERE "
            arguments = []

            def build_where_clause(
                where_clause_values: List[Any],
                sub_filter: dict,
                prefix_args: Optional[List[str]] = None,
            ) -> None:
                prefix_args = prefix_args or []
                for key in sub_filter:
                    if isinstance(sub_filter[key], dict):
                        build_where_clause(
                            where_clause_values, sub_filter[key], [*prefix_args, key]
                        )
                    else:
                        arguments.append(
                            "JSON_EXTRACT_JSON({}, {}) = %s".format(
                                self.metadata_field,
                                ", ".join(["%s"] * (len(prefix_args) + 1)),
                            )
                        )
                        where_clause_values += [*prefix_args, key]
                        where_clause_values.append(json.dumps(sub_filter[key]))

            build_where_clause(where_clause_values, filter)
            where_clause += " AND ".join(arguments)

        results: Sequence[Any] = []
        if query_embedding:
            try:
                cur = conn.cursor()
                formatted_vector = "[{}]".format(",".join(map(str, query_embedding)))
                try:
                    logger.debug("vector field: %s", formatted_vector)
                    logger.debug("similarity_top_k: %s", similarity_top_k)
                    cur.execute(
                        f"SELECT {self.content_field}, {self.metadata_field}, "
                        f"DOT_PRODUCT({self.vector_field}, "
                        "JSON_ARRAY_PACK(%s)) as similarity_score "
                        f"FROM {self.table_name} {where_clause} "
                        f"ORDER BY similarity_score DESC LIMIT {similarity_top_k}",
                        (formatted_vector, *tuple(where_clause_values)),
                    )
                    results = cur.fetchall()
                finally:
                    cur.close()
            finally:
                conn.close()

        nodes = []
        similarities = []
        ids = []
        for result in results:
            text, metadata, similarity_score = result
            node = metadata_dict_to_node(metadata)
            node.set_content(text)
            nodes.append(node)
            similarities.append(similarity_score)
            ids.append(node.node_id)

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

client property #

client: Any

Return SingleStoreDB client.
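Note that, per the source above, `client` calls `_get_connection()` and therefore opens a fresh `s2.connect(...)` connection each time rather than borrowing from the pool, so close it when finished. A small sketch:

```python
conn = vector_store.client  # fresh SingleStoreDB connection, not pool-managed
try:
    cur = conn.cursor()
    # Count rows in the configured embeddings table.
    cur.execute(f"SELECT COUNT(*) FROM {vector_store.table_name}")
    print(cur.fetchone())
    cur.close()
finally:
    conn.close()  # close explicitly; the pool will not reclaim it
```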

add #

add(nodes: List[BaseNode], **add_kwargs: Any) -> List[str]

Add nodes to index.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `nodes` | `List[BaseNode]` | List of nodes with embeddings. | required |
Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-singlestoredb/llama_index/vector_stores/singlestoredb/base.py
def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
    """Add nodes to index.

    Args:
        nodes: List[BaseNode]: list of nodes with embeddings

    """
    conn = self.connection_pool.connect()
    cursor = conn.cursor()
    try:
        for node in nodes:
            embedding = node.get_embedding()
            metadata = node_to_metadata_dict(
                node, remove_text=True, flat_metadata=self.flat_metadata
            )
            cursor.execute(
                "INSERT INTO {} VALUES (%s, JSON_ARRAY_PACK(%s), %s)".format(
                    self.table_name
                ),
                (
                    node.get_content(metadata_mode=MetadataMode.NONE) or "",
                    "[{}]".format(",".join(map(str, embedding))),
                    json.dumps(metadata),
                ),
            )
    finally:
        cursor.close()
        conn.close()
    return [node.node_id for node in nodes]
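A minimal usage sketch for `add` (the texts and three-dimensional embeddings below are placeholders; in practice the vectors come from your embedding model):

```python
from llama_index.core.schema import TextNode

nodes = [
    TextNode(text="SingleStore supports vector search.", embedding=[0.1, 0.2, 0.3]),
    TextNode(text="Embeddings are stored as packed blobs.", embedding=[0.2, 0.1, 0.0]),
]

# Each node is inserted as (content, JSON_ARRAY_PACK(vector), metadata JSON).
ids = vector_store.add(nodes)
print(ids)  # node_ids of the inserted nodes
```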

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

Delete nodes with the given ref_doc_id.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `ref_doc_id` | `str` | The doc_id of the document to delete. | required |
Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-singlestoredb/llama_index/vector_stores/singlestoredb/base.py
def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
    """
    Delete nodes with the given ref_doc_id.

    Args:
        ref_doc_id (str): The doc_id of the document to delete.

    """
    conn = self.connection_pool.connect()
    cursor = conn.cursor()
    try:
        cursor.execute(
            f"DELETE FROM {self.table_name} WHERE JSON_EXTRACT_JSON(metadata, 'ref_doc_id') = %s",
            ('"' + ref_doc_id + '"',),
        )
    finally:
        cursor.close()
        conn.close()
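Deletion matches rows whose metadata JSON carries the given `ref_doc_id`, so one call removes every node ingested from that source document. A one-line sketch (the id is a placeholder):

```python
# Removes all rows whose stored ref_doc_id equals "source-doc-1".
vector_store.delete(ref_doc_id="source-doc-1")
```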

query #

query(query: VectorStoreQuery, filter: Optional[dict] = None, **kwargs: Any) -> VectorStoreQueryResult

Query index for top k most similar nodes.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `query` | `VectorStoreQuery` | Contains `query_embedding` and `similarity_top_k` attributes. | required |
| `filter` | `Optional[dict]` | A dictionary of metadata fields and values to filter by. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `VectorStoreQueryResult` | Contains `nodes`, `similarities`, and `ids` attributes. |

Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-singlestoredb/llama_index/vector_stores/singlestoredb/base.py
def query(
    self, query: VectorStoreQuery, filter: Optional[dict] = None, **kwargs: Any
) -> VectorStoreQueryResult:
    """
    Query index for top k most similar nodes.

    Args:
        query (VectorStoreQuery): Contains query_embedding and similarity_top_k attributes.
        filter (Optional[dict]): A dictionary of metadata fields and values to filter by. Defaults to None.

    Returns:
        VectorStoreQueryResult: Contains nodes, similarities, and ids attributes.
    """
    query_embedding = query.query_embedding
    similarity_top_k = query.similarity_top_k
    conn = self.connection_pool.connect()
    where_clause: str = ""
    where_clause_values: List[Any] = []

    if filter:
        where_clause = "WHERE "
        arguments = []

        def build_where_clause(
            where_clause_values: List[Any],
            sub_filter: dict,
            prefix_args: Optional[List[str]] = None,
        ) -> None:
            prefix_args = prefix_args or []
            for key in sub_filter:
                if isinstance(sub_filter[key], dict):
                    build_where_clause(
                        where_clause_values, sub_filter[key], [*prefix_args, key]
                    )
                else:
                    arguments.append(
                        "JSON_EXTRACT_JSON({}, {}) = %s".format(
                            self.metadata_field,
                            ", ".join(["%s"] * (len(prefix_args) + 1)),
                        )
                    )
                    where_clause_values += [*prefix_args, key]
                    where_clause_values.append(json.dumps(sub_filter[key]))

        build_where_clause(where_clause_values, filter)
        where_clause += " AND ".join(arguments)

    results: Sequence[Any] = []
    if query_embedding:
        try:
            cur = conn.cursor()
            formatted_vector = "[{}]".format(",".join(map(str, query_embedding)))
            try:
                logger.debug("vector field: %s", formatted_vector)
                logger.debug("similarity_top_k: %s", similarity_top_k)
                cur.execute(
                    f"SELECT {self.content_field}, {self.metadata_field}, "
                    f"DOT_PRODUCT({self.vector_field}, "
                    "JSON_ARRAY_PACK(%s)) as similarity_score "
                    f"FROM {self.table_name} {where_clause} "
                    f"ORDER BY similarity_score DESC LIMIT {similarity_top_k}",
                    (formatted_vector, *tuple(where_clause_values)),
                )
                results = cur.fetchall()
            finally:
                cur.close()
        finally:
            conn.close()

    nodes = []
    similarities = []
    ids = []
    for result in results:
        text, metadata, similarity_score = result
        node = metadata_dict_to_node(metadata)
        node.set_content(text)
        nodes.append(node)
        similarities.append(similarity_score)
        ids.append(node.node_id)

    return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)
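Calling `query` directly is a matter of building a `VectorStoreQuery`. The optional `filter` dict may nest, in which case `build_where_clause` flattens the keys into a JSON path lookup. A sketch with placeholder values:

```python
from llama_index.core.vector_stores.types import VectorStoreQuery

q = VectorStoreQuery(
    query_embedding=[0.1, 0.2, 0.3],  # placeholder; use a real query embedding
    similarity_top_k=2,
)

# The nested filter becomes JSON_EXTRACT_JSON(metadata, 'author', 'name') = '"Alice"'.
result = vector_store.query(q, filter={"author": {"name": "Alice"}})
for node, score in zip(result.nodes, result.similarities):
    print(score, node.get_content())
```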