PII Masking
!pip install llama-index==0.5.27
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.indices.postprocessor import PIINodePostprocessor, NERPIINodePostprocessor
from llama_index.llm_predictor import StableLMPredictor
from llama_index import ServiceContext, Document, VectorStoreIndex
from llama_index.data_structs import Node
# create a node from sample text containing PII
text = """
Hello Paulo Santos. The latest statement for your credit card account \
1111-0000-1111-0000 was mailed to 123 Any Street, Seattle, WA 98109.
"""
node = Node(text)
Option 1: Use NER Model for PII Masking
Use a Hugging Face NER (named entity recognition) model to detect and mask PII entities.
service_context = ServiceContext.from_defaults()
processor = NERPIINodePostprocessor(service_context=service_context)
new_nodes = processor.postprocess_nodes([node])
# view redacted text
new_nodes[0].get_text()
'Hello [ORG_7]. The latest statement for your credit card account 1111-0000-1111-0000 was mailed to 123 [ORG_109] [LOC_113], [LOC_121], [LOC_130] 98109.'
# get mapping in node_info
# NOTE: this is not sent to the LLM!
new_nodes[0].node_info["__pii_node_info__"]
{'[ORG_7]': 'Paulo Santos',
'[ORG_109]': 'Any',
'[LOC_113]': 'Street',
'[LOC_121]': 'Seattle',
'[LOC_130]': 'WA'}
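Note that the NER model mislabels some entities (a person tagged as ORG) and misses the credit card number entirely; the LLM-based option below handles that case. Because the mapping above never leaves your machine, you can also restore the original entities locally after the round trip. A minimal sketch, using an unmask helper of our own (not part of llama_index):
# hypothetical helper: substitute masked tokens back using the locally stored mapping
def unmask(text: str, mapping: dict) -> str:
    for token, original in mapping.items():
        text = text.replace(token, original)
    return text

unmask(new_nodes[0].get_text(), new_nodes[0].node_info["__pii_node_info__"])
# -> 'Hello Paulo Santos. The latest statement for your credit card account 1111-0000-1111-0000 was mailed to 123 Any Street, Seattle, WA 98109.'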
Option 2: Use LLM for PII Masking
NOTE: You should use a local LLM for PII masking, so the raw (unmasked) text is never sent to an external API. This example uses StableLM via the StableLMPredictor imported above.
# NOTE: constructing StableLMPredictor with its defaults is an assumption here;
# any locally hosted LLM predictor can be swapped in.
stablelm_predictor = StableLMPredictor()
service_context = ServiceContext.from_defaults(llm_predictor=stablelm_predictor)
processor = PIINodePostprocessor(service_context=service_context)
new_nodes = processor.postprocess_nodes([node])
# view redacted text
new_nodes[0].get_text()
'Hello [NAME]. The latest statement for your credit card account [CREDIT_CARD_NUMBER] was mailed to [ADDRESS].'
# get mapping in node_info
# NOTE: this is not sent to the LLM!
new_nodes[0].node_info["__pii_node_info__"]
{'NAME': 'Paulo Santos',
'CREDIT_CARD_NUMBER': '1111-0000-1111-0000',
'ADDRESS': '123 Any Street, Seattle, WA 98109'}
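Before indexing, it is worth a quick sanity check that none of the recovered PII values still appear in the masked text (a small sketch of our own, not a llama_index API):
# sanity check: no original PII value should survive in the masked text
masked_text = new_nodes[0].get_text()
for original in new_nodes[0].node_info["__pii_node_info__"].values():
    assert original not in masked_text, f"PII leaked: {original}"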
Feed Nodes to Index
# feed the masked nodes into the index; from_documents expects Document objects,
# so construct the index from the nodes directly
index = VectorStoreIndex(new_nodes)
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 30 tokens
response = index.query("What address was the statement mailed to?")
print(str(response))
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 72 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 8 tokens
[ADDRESS]
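The index only ever saw masked text, so the response comes back masked as well. Since the Option 2 mapping keys are stored without brackets, re-add them before substituting with the unmask helper from earlier (again, our own sketch, not a llama_index API):
# restore the original value locally; the mapping never left the machine
mapping = new_nodes[0].node_info["__pii_node_info__"]
print(unmask(str(response), {f"[{k}]": v for k, v in mapping.items()}))
# should recover: 123 Any Street, Seattle, WA 98109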