Skip to content

S3

S3Reader #

Bases: BasePydanticReader

General reader for any S3 file or directory.

If key is not set, the entire bucket (filtered by prefix) is parsed.

Args:
- bucket (str): the name of your S3 bucket.
- key (Optional[str]): the name of the specific file. If none is provided, this loader will iterate through the entire bucket.
- prefix (Optional[str]): the prefix to filter by in the case that the loader iterates through the entire bucket. Defaults to empty string.
- recursive (bool): Whether to recursively search in subdirectories. True by default.
- file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file extension to a BaseReader class that specifies how to convert that file to text. See SimpleDirectoryReader for more details.
- required_exts (Optional[List[str]]): List of required extensions. Default is None.
- num_files_limit (Optional[int]): Maximum number of files to read. Default is None.
- file_metadata (Optional[Callable[[str], Dict]]): A function that takes in a filename and returns a Dict of metadata for the Document. Default is None.
- aws_access_id (Optional[str]): provide AWS access key ID directly.
- aws_access_secret (Optional[str]): provide AWS secret access key directly.
- s3_endpoint_url (Optional[str]): provide S3 endpoint URL directly.

Source code in llama-index-integrations/readers/llama-index-readers-s3/llama_index/readers/s3/base.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
class S3Reader(BasePydanticReader):
    """
    General reader for any S3 file or directory.

    If key is not set, the entire bucket (filtered by prefix) is parsed.

    Args:
        bucket (str): the name of your S3 bucket.
        key (Optional[str]): the name of the specific file. If none is
            provided, this loader will iterate through the entire bucket.
        prefix (Optional[str]): the prefix to filter by in the case that the
            loader iterates through the entire bucket. Defaults to empty
            string. Ignored when `key` is set.
        recursive (bool): Whether to recursively search in subdirectories.
            True by default.
        file_extractor (Optional[Dict[str, Union[str, BaseReader]]]): A
            mapping of file extension to a BaseReader class that specifies
            how to convert that file to text. See `SimpleDirectoryReader`
            for more details.
        required_exts (Optional[List[str]]): List of required extensions.
            Default is None.
        filename_as_id (bool): Passed through to `SimpleDirectoryReader` as
            `filename_as_id`. True by default.
        num_files_limit (Optional[int]): Maximum number of files to read.
            Default is None.
        file_metadata (Optional[Callable[[str], Dict]]): A function that
            takes in a filename and returns a Dict of metadata for the
            Document. Default is None.
        aws_access_id (Optional[str]): provide AWS access key ID directly.
        aws_access_secret (Optional[str]): provide AWS secret access key
            directly.
        aws_session_token (Optional[str]): provide AWS session token
            directly. Passed to `S3FileSystem` as `token`.
        s3_endpoint_url (Optional[str]): provide S3 endpoint URL directly.
        custom_reader_path (Optional[str]): Not referenced by the methods
            defined on this class. Default is None.
    """

    is_remote: bool = True

    bucket: str
    key: Optional[str] = None
    prefix: Optional[str] = ""
    recursive: bool = True
    # Excluded from serialization (exclude=True).
    file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = Field(
        default=None, exclude=True
    )
    required_exts: Optional[List[str]] = None
    filename_as_id: bool = True
    num_files_limit: Optional[int] = None
    # Excluded from serialization (exclude=True).
    file_metadata: Optional[Callable[[str], Dict]] = Field(default=None, exclude=True)
    aws_access_id: Optional[str] = None
    aws_access_secret: Optional[str] = None
    aws_session_token: Optional[str] = None
    s3_endpoint_url: Optional[str] = None
    custom_reader_path: Optional[str] = None

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "S3Reader"

    def load_s3_files_as_docs(self, temp_dir=None) -> List[Document]:
        """
        Load file(s) from S3.

        Args:
            temp_dir: Not used by this method.

        Returns:
            List[Document]: The documents produced by `SimpleDirectoryReader`
                reading through an `S3FileSystem`.
        """
        # Imported lazily so s3fs is only needed when data is actually loaded.
        from s3fs import S3FileSystem

        s3fs = S3FileSystem(
            key=self.aws_access_id,
            endpoint_url=self.s3_endpoint_url,
            secret=self.aws_access_secret,
            token=self.aws_session_token,
        )

        input_dir = self.bucket
        input_files = None

        # A specific key takes precedence over the prefix filter.
        if self.key:
            input_files = [f"{self.bucket}/{self.key}"]
        elif self.prefix:
            input_dir = f"{input_dir}/{self.prefix}"

        loader = SimpleDirectoryReader(
            input_dir=input_dir,
            input_files=input_files,
            file_extractor=self.file_extractor,
            required_exts=self.required_exts,
            filename_as_id=self.filename_as_id,
            num_files_limit=self.num_files_limit,
            file_metadata=self.file_metadata,
            recursive=self.recursive,
            fs=s3fs,
        )

        return loader.load_data()

    def load_data(self, custom_temp_subdir: Optional[str] = None) -> List[Document]:
        """
        Load the file(s) from S3.

        Each returned document's `id_` is prefixed with the configured
        endpoint URL and an underscore, or with "s3_" when no endpoint URL
        is set.

        Args:
            custom_temp_subdir (str, optional): This parameter is deprecated and unused. Defaults to None.

        Returns:
            List[Document]: A list of documents loaded from S3.
        """
        if custom_temp_subdir is not None:
            warnings.warn(
                "The `custom_temp_subdir` parameter is deprecated and unused. Please remove it from your code.",
                DeprecationWarning,
                # Attribute the warning to the caller's line, not this one.
                stacklevel=2,
            )

        # The prefix does not depend on the document, so compute it once.
        if self.s3_endpoint_url:
            id_prefix = self.s3_endpoint_url + "_"
        else:
            id_prefix = "s3_"

        documents = self.load_s3_files_as_docs()
        for doc in documents:
            doc.id_ = id_prefix + doc.id_

        return documents

load_s3_files_as_docs #

load_s3_files_as_docs(temp_dir=None) -> List[Document]

Load file(s) from S3.

Source code in llama-index-integrations/readers/llama-index-readers-s3/llama_index/readers/s3/base.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def load_s3_files_as_docs(self, temp_dir=None) -> List[Document]:
    """
    Read the configured S3 object(s) and return them as documents.

    When `self.key` is set, only that object is read. Otherwise the bucket
    is read as a directory, narrowed to `self.prefix` when one is set.
    `temp_dir` is not used by this function.
    """
    from s3fs import S3FileSystem

    filesystem = S3FileSystem(
        key=self.aws_access_id,
        secret=self.aws_access_secret,
        token=self.aws_session_token,
        endpoint_url=self.s3_endpoint_url,
    )

    # Resolve what to read: a single object, a prefixed directory, or the
    # whole bucket.
    if self.key:
        target_dir = self.bucket
        target_files = [f"{self.bucket}/{self.key}"]
    elif self.prefix:
        target_dir = f"{self.bucket}/{self.prefix}"
        target_files = None
    else:
        target_dir = self.bucket
        target_files = None

    reader = SimpleDirectoryReader(
        input_dir=target_dir,
        input_files=target_files,
        file_extractor=self.file_extractor,
        required_exts=self.required_exts,
        filename_as_id=self.filename_as_id,
        num_files_limit=self.num_files_limit,
        file_metadata=self.file_metadata,
        recursive=self.recursive,
        fs=filesystem,
    )
    return reader.load_data()

load_data #

load_data(custom_temp_subdir: str = None) -> List[Document]

Load the file(s) from S3.

Parameters:

Name Type Description Default
custom_temp_subdir str

This parameter is deprecated and unused. Defaults to None.

None

Returns:

Type Description
List[Document]

List[Document]: A list of documents loaded from S3.

Source code in llama-index-integrations/readers/llama-index-readers-s3/llama_index/readers/s3/base.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def load_data(self, custom_temp_subdir: Optional[str] = None) -> List[Document]:
    """
    Load the file(s) from S3.

    Each returned document's `id_` is prefixed with the configured endpoint
    URL and an underscore, or with "s3_" when no endpoint URL is set.

    Args:
        custom_temp_subdir (str, optional): This parameter is deprecated and unused. Defaults to None.

    Returns:
        List[Document]: A list of documents loaded from S3.
    """
    if custom_temp_subdir is not None:
        warnings.warn(
            "The `custom_temp_subdir` parameter is deprecated and unused. Please remove it from your code.",
            DeprecationWarning,
            # Attribute the warning to the caller's line, not this one.
            stacklevel=2,
        )

    # The prefix does not depend on the document, so compute it once.
    if self.s3_endpoint_url:
        id_prefix = self.s3_endpoint_url + "_"
    else:
        id_prefix = "s3_"

    documents = self.load_s3_files_as_docs()
    for doc in documents:
        doc.id_ = id_prefix + doc.id_

    return documents