Skip to content

Huggingface fs

Init params.

HuggingFaceFSReader #

Bases: BaseReader

Hugging Face File System reader.

Uses the new Filesystem API from the Hugging Face Hub client library.

Source code in llama-index-integrations/readers/llama-index-readers-huggingface-fs/llama_index/readers/huggingface_fs/base.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class HuggingFaceFSReader(BaseReader):
    """Hugging Face File System reader.

    Uses the Filesystem API (``HfFileSystem``) from the Hugging Face Hub
    client library to fetch a file and parse it as newline-delimited JSON.
    Files whose name carries a ``.gz`` suffix are gunzipped first.
    """

    def __init__(self) -> None:
        """Create the reader together with its ``HfFileSystem`` handle."""
        # Imported here rather than at module level, as in the original code.
        from huggingface_hub import HfFileSystem

        self.fs = HfFileSystem()

    def load_dicts(self, path: str) -> List[Dict]:
        """Read ``path`` and return the JSON value parsed from each line.

        Args:
            path: Location of the file, passed unchanged to
                ``self.fs.read_bytes``. If any of its suffixes is ``.gz``
                the content is gzip-decompressed before parsing.

        Returns:
            The parsed values in file order. Lines that are not valid JSON
            (including empty lines) are skipped silently.
        """
        payload = self.fs.read_bytes(path)

        if ".gz" in Path(path).suffixes:
            import gzip

            # The compressed bytes are already in memory, so decompress them
            # directly; going through a temporary file left an unclosed gzip
            # handle behind while the temporary directory was being removed.
            payload = gzip.decompress(payload)

        records = []
        for line in payload.decode().split("\n"):
            try:
                records.append(json.loads(line))
            except json.decoder.JSONDecodeError:
                # Best-effort parsing: ignore blank or malformed lines.
                continue
        return records

    def load_df(self, path: str) -> pd.DataFrame:
        """Return the records parsed from ``path`` as a pandas DataFrame."""
        return pd.DataFrame(self.load_dicts(path))

    def load_data(self, path: str) -> List[Document]:
        """Return one ``Document`` per record parsed from ``path``.

        Each document's text is ``str(record)`` of the parsed JSON value.
        """
        return [Document(text=str(record)) for record in self.load_dicts(path)]

load_dicts #

load_dicts(path: str) -> List[Dict]

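Read the file at `path` and parse it as newline-delimited JSON, returning one parsed value per valid line. Files with a `.gz` suffix are decompressed first; lines that are not valid JSON are skipped.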

Source code in llama-index-integrations/readers/llama-index-readers-huggingface-fs/llama_index/readers/huggingface_fs/base.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def load_dicts(self, path: str) -> List[Dict]:
    """Read ``path`` and return the JSON value parsed from each line.

    Args:
        path: Location of the file, passed unchanged to
            ``self.fs.read_bytes``. If any of its suffixes is ``.gz`` the
            content is gzip-decompressed before parsing.

    Returns:
        The parsed values in file order. Lines that are not valid JSON
        (including empty lines) are skipped silently.
    """
    payload = self.fs.read_bytes(path)

    if ".gz" in Path(path).suffixes:
        import gzip

        # The compressed bytes are already in memory, so decompress them
        # directly; going through a temporary file left an unclosed gzip
        # handle behind while the temporary directory was being removed.
        payload = gzip.decompress(payload)

    records = []
    for line in payload.decode().split("\n"):
        try:
            records.append(json.loads(line))
        except json.decoder.JSONDecodeError:
            # Best-effort parsing: ignore blank or malformed lines.
            continue
    return records

load_df #

load_df(path: str) -> DataFrame

Load the file at `path` into a pandas DataFrame built from the records returned by `load_dicts`.

Source code in llama-index-integrations/readers/llama-index-readers-huggingface-fs/llama_index/readers/huggingface_fs/base.py
57
58
59
def load_df(self, path: str) -> pd.DataFrame:
    """Build a pandas DataFrame from the records ``load_dicts`` returns.

    Args:
        path: File location, forwarded unchanged to ``self.load_dicts``.

    Returns:
        ``pd.DataFrame`` constructed from the list of parsed records.
    """
    records = self.load_dicts(path)
    frame = pd.DataFrame(records)
    return frame

load_data #

load_data(path: str) -> List[Document]

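Load the file at `path` and return one `Document` per parsed record, using the record's string representation as the document text.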

Source code in llama-index-integrations/readers/llama-index-readers-huggingface-fs/llama_index/readers/huggingface_fs/base.py
61
62
63
64
65
66
67
def load_data(self, path: str) -> List[Document]:
    """Wrap every record returned by ``load_dicts`` in a ``Document``.

    Args:
        path: File location, forwarded unchanged to ``self.load_dicts``.

    Returns:
        One ``Document`` per record, in order, whose ``text`` is
        ``str(record)``.
    """
    return [Document(text=str(record)) for record in self.load_dicts(path)]