Overview
Document loaders load data from various sources into Document objects. They provide a standard interface for loading and optionally splitting documents.
BaseLoader
Abstract interface for document loaders.
Source: langchain_core.document_loaders.base:26
Core Methods
load
def load(self) -> list[Document]
Load data into Document objects.
Note: Prefer implementing lazy_load instead. This method is provided for convenience and calls lazy_load under the hood.
aload
async def aload(self) -> list[Document]
Async version of load.
lazy_load
def lazy_load(self) -> Iterator[Document]
Lazy loader for documents. Implement this method in subclasses.
Generators avoid loading all documents into memory at once, which is important for large datasets.
Iterator yielding documents one at a time
alazy_load
async def alazy_load(self) -> AsyncIterator[Document]
Async lazy loader for documents.
Async iterator yielding documents
load_and_split
def load_and_split(
self,
text_splitter: TextSplitter | None = None
) -> list[Document]
Load documents and split into chunks.
Text splitter to use. Defaults to RecursiveCharacterTextSplitter if None.
List of split document chunks
Raises:
ImportError: If langchain-text-splitters not installed and no text_splitter provided
Note: Do not override this method; it is slated for deprecation in a future release — implement lazy_load instead.
Implementation Example
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
class CustomLoader(BaseLoader):
    """Load documents from a custom source, one Document per line.

    Each line of the file becomes a Document whose metadata records the
    originating line number and the source path.
    """

    def __init__(self, file_path: str, encoding: str = "utf-8"):
        """Initialize the loader.

        Args:
            file_path: Path to the text file to read.
            encoding: Text encoding used to open the file. Explicit UTF-8
                default avoids the platform-dependent locale encoding the
                original relied on.
        """
        self.file_path = file_path
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per line of the file."""
        with open(self.file_path, "r", encoding=self.encoding) as f:
            for i, line in enumerate(f):
                yield Document(
                    page_content=line.strip(),
                    metadata={"line": i, "source": self.file_path},
                )
# Usage
loader = CustomLoader("data.txt")
docs = loader.load() # Loads all at once
# or
for doc in loader.lazy_load(): # Loads one at a time
process(doc)
BaseBlobParser
Abstract interface for parsing blobs into documents.
Source: langchain_core.document_loaders.base:117
A blob parser provides a way to parse raw data stored in a Blob into one or more Document objects. Parsers can be composed with blob loaders.
Methods
lazy_parse
def lazy_parse(self, blob: Blob) -> Iterator[Document]
Lazy parsing interface. Must be implemented by subclasses.
parse
def parse(self, blob: Blob) -> list[Document]
Eagerly parse the blob into documents.
Note: This is a convenience method for interactive development. Production code should use lazy_parse.
Implementation Example
from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Document, Blob
import json
class JSONParser(BaseBlobParser):
    """Parse JSON blobs into documents.

    A top-level JSON array yields one Document per element; any other
    top-level value yields a single Document for the whole payload.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Yield Documents parsed from the blob's JSON content."""
        parsed = json.loads(blob.as_string())
        # Normalize both shapes (array vs. single value) to one code path.
        records = parsed if isinstance(parsed, list) else [parsed]
        for record in records:
            yield Document(
                page_content=json.dumps(record),
                metadata={"source": blob.source},
            )
# Usage
parser = JSONParser()
blob = Blob.from_path("data.json")
docs = parser.parse(blob)
Document
Class representing a document with content and metadata.
Source: langchain_core.documents
Properties
page_content (str): The text content of the document
metadata
dict[str, Any]
default:"{}"
Metadata associated with the document. Common keys:
source: Source of the document
page: Page number
chunk: Chunk index
id (str | None): Optional unique identifier for the document
Constructor
def __init__(
self,
page_content: str,
*,
metadata: dict[str, Any] | None = None,
id: str | None = None
)
Example
from langchain_core.documents import Document
doc = Document(
page_content="LangChain is a framework for developing applications powered by language models.",
metadata={
"source": "https://langchain.com",
"title": "LangChain Overview",
"date": "2024-01-01"
},
id="doc_123"
)
print(doc.page_content)
print(doc.metadata["source"])
Blob
Represents raw data blob with optional metadata.
Source: langchain_core.documents.base
Blobs are used to represent binary or text data from various sources (files, URLs, etc.).
Properties
source: Source identifier (file path, URL, etc.)
Methods
from_path
@classmethod
def from_path(
cls,
path: str | Path,
*,
encoding: str = "utf-8",
mime_type: str | None = None
) -> Blob
Create a blob from a file path.
mime_type: MIME type of the file (auto-detected from the path if None)
from_data
@classmethod
def from_data(
cls,
data: bytes | str,
*,
encoding: str = "utf-8",
mime_type: str | None = None,
source: str | None = None
) -> Blob
Create a blob from raw data.
as_string
def as_string(self) -> str
Get the blob data as a string.
as_bytes
def as_bytes(self) -> bytes
Get the blob data as bytes.
Example
from langchain_core.documents import Blob
# From file
blob = Blob.from_path("document.pdf", mime_type="application/pdf")
# From data
blob = Blob.from_data(
data="Hello, world!",
mime_type="text/plain",
source="inline"
)
print(blob.as_string())
Common Loader Patterns
Text File Loader Pattern
class TextFileLoader(BaseLoader):
    """Yield a single Document holding the entire contents of a text file."""

    def __init__(self, file_path: str, encoding: str = "utf-8"):
        """Remember the file path and the encoding to read it with."""
        self.file_path = file_path
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        """Emit the whole file as one Document, tagged with its source path."""
        with open(self.file_path, "r", encoding=self.encoding) as handle:
            yield Document(
                page_content=handle.read(),
                metadata={"source": self.file_path},
            )
Directory Loader Pattern
class DirectoryLoader(BaseLoader):
    """Load every file matching a glob pattern under a directory.

    Each matching regular file becomes one Document containing the file's
    full text.
    """

    def __init__(self, path: str, glob: str = "**/*", encoding: str = "utf-8"):
        """Initialize the loader.

        Args:
            path: Root directory to scan.
            glob: Glob pattern evaluated relative to ``path`` (default
                matches everything, recursively).
            encoding: Text encoding used when reading files. Explicit UTF-8
                default replaces the non-portable platform default the
                original relied on.
        """
        self.path = Path(path)
        self.glob = glob
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per matching regular file.

        Raises:
            UnicodeDecodeError: If a matched file is not valid text in the
                configured encoding (e.g. a binary file caught by the
                default ``**/*`` pattern).
        """
        for file_path in self.path.glob(self.glob):
            if file_path.is_file():
                # read_text handles open/close and applies the encoding.
                yield Document(
                    page_content=file_path.read_text(encoding=self.encoding),
                    metadata={"source": str(file_path)},
                )
Multi-Document File Loader Pattern
class JSONLinesLoader(BaseLoader):
    """Load JSONL files where each line is a document.

    Each non-blank line must be a JSON object; its ``"text"`` field becomes
    the page content and its ``"metadata"`` object (if present) is merged
    into the Document metadata.
    """

    def __init__(self, file_path: str):
        """Initialize with the path of the JSONL file to load."""
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per JSON line.

        Fixes over the naive version:
        - Opens with explicit UTF-8, which the JSON Lines format mandates;
          the platform default encoding is not portable.
        - Skips blank lines, which would otherwise raise
          json.JSONDecodeError on ``json.loads("")``.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        with open(self.file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if not line.strip():
                    continue  # tolerate blank/separator lines; keep original line numbering
                data = json.loads(line)
                yield Document(
                    page_content=data.get("text", ""),
                    metadata={
                        "source": self.file_path,
                        "line": i,
                        **data.get("metadata", {}),
                    },
                )