Document Loaders

Overview

Document loaders load data from various sources into Document objects. They provide a standard interface for loading and optionally splitting documents.

BaseLoader

Abstract interface for document loaders. Source: langchain_core.document_loaders.base:26

Core Methods

load

def load(self) -> list[Document]
Load data into Document objects.
return
list[Document]
List of loaded documents
Note: Prefer implementing lazy_load instead. This method is provided for convenience and calls lazy_load under the hood.

aload

async def aload(self) -> list[Document]
Async version of load.
return
list[Document]
List of loaded documents

lazy_load

def lazy_load(self) -> Iterator[Document]
Lazy loader for documents. Implement this method in subclasses. Generators avoid loading all documents into memory at once, which is important for large datasets.
return
Iterator[Document]
Iterator yielding documents one at a time

alazy_load

async def alazy_load(self) -> AsyncIterator[Document]
Async lazy loader for documents.
return
AsyncIterator[Document]
Async iterator yielding documents

load_and_split

def load_and_split(
    self,
    text_splitter: TextSplitter | None = None
) -> list[Document]
Load documents and split into chunks.
text_splitter
TextSplitter | None
Text splitter to use. Defaults to RecursiveCharacterTextSplitter if None.
return
list[Document]
List of split document chunks
Raises:
  • ImportError: If langchain-text-splitters not installed and no text_splitter provided
Note: Do not override this method — it is slated for deprecation in a future release. Implement lazy_load instead and let load_and_split compose it with a splitter.

Implementation Example

from typing import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document


class CustomLoader(BaseLoader):
    """Load documents from a text file, one Document per line."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per line without reading the whole file into memory."""
        # Explicit encoding avoids platform-dependent defaults.
        with open(self.file_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                yield Document(
                    page_content=line.strip(),
                    metadata={"line": i, "source": self.file_path},
                )


# Usage
loader = CustomLoader("data.txt")
docs = loader.load()  # Eager: materializes all documents at once
# or stream them one at a time:
for doc in loader.lazy_load():
    process(doc)  # `process` is a placeholder for your own handling

BaseBlobParser

Abstract interface for parsing blobs into documents. Source: langchain_core.document_loaders.base:117. A blob parser provides a way to parse raw data stored in a Blob into one or more Document objects. Parsers can be composed with blob loaders.

Methods

lazy_parse

def lazy_parse(self, blob: Blob) -> Iterator[Document]
Lazy parsing interface. Must be implemented by subclasses.
blob
Blob
required
The blob to parse
return
Iterator[Document]
Generator of documents

parse

def parse(self, blob: Blob) -> list[Document]
Eagerly parse the blob into documents.
blob
Blob
required
The blob to parse
return
list[Document]
List of parsed documents
Note: This is a convenience method for interactive development. Production code should use lazy_parse.

Implementation Example

import json
from typing import Iterator

from langchain_core.document_loaders import BaseBlobParser
from langchain_core.documents import Blob, Document


class JSONParser(BaseBlobParser):
    """Parse JSON blobs into documents.

    A top-level JSON array yields one Document per element; any other
    JSON value yields a single Document.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse a JSON blob into documents."""
        data = json.loads(blob.as_string())

        # Normalize: treat a non-list payload as a one-element list so a
        # single loop covers both cases.
        items = data if isinstance(data, list) else [data]
        for item in items:
            yield Document(
                page_content=json.dumps(item),
                metadata={"source": blob.source},
            )


# Usage
parser = JSONParser()
blob = Blob.from_path("data.json")
docs = parser.parse(blob)

Document

Class representing a document with content and metadata. Source: langchain_core.documents

Properties

page_content
str
required
The text content of the document
metadata
dict[str, Any]
default:"{}"
Metadata associated with the document. Common keys:
  • source: Source of the document
  • page: Page number
  • chunk: Chunk index
id
str | None
default:"None"
Optional unique identifier

Constructor

def __init__(
    self,
    page_content: str,
    *,
    metadata: dict[str, Any] | None = None,
    id: str | None = None
)
page_content
str
required
Document content
metadata
dict | None
Document metadata
id
str | None
Document ID

Example

from langchain_core.documents import Document

# Provenance metadata travels with the content through splitting and retrieval.
source_info = {
    "source": "https://langchain.com",
    "title": "LangChain Overview",
    "date": "2024-01-01",
}

doc = Document(
    page_content="LangChain is a framework for developing applications powered by language models.",
    metadata=source_info,
    id="doc_123",
)

print(doc.page_content)
print(doc.metadata["source"])

Blob

Represents a raw data blob with optional metadata. Source: langchain_core.documents.base. Blobs are used to represent binary or text data from various sources (files, URLs, etc.).

Properties

data
bytes | str | None
The raw data
mimetype
str | None
MIME type of the data
encoding
str
default:"'utf-8'"
Encoding of the data
source
str | None
Source identifier (file path, URL, etc.)

Methods

from_path

@classmethod
def from_path(
    cls,
    path: str | Path,
    *,
    encoding: str = "utf-8",
    mime_type: str | None = None
) -> Blob
Create a blob from a file path.
path
str | Path
required
Path to the file
encoding
str
default:"'utf-8'"
Text encoding
mime_type
str | None
MIME type (auto-detected if None)

from_data

@classmethod
def from_data(
    cls,
    data: bytes | str,
    *,
    encoding: str = "utf-8",
    mime_type: str | None = None,
    source: str | None = None
) -> Blob
Create a blob from raw data.

as_string

def as_string(self) -> str
Get the blob data as a string.

as_bytes

def as_bytes(self) -> bytes
Get the blob data as bytes.

Example

from langchain_core.documents import Blob

# Wrap an on-disk file without reading it up front.
pdf_blob = Blob.from_path("document.pdf", mime_type="application/pdf")

# Wrap in-memory data, tagging where it came from.
blob = Blob.from_data(
    data="Hello, world!",
    mime_type="text/plain",
    source="inline",
)

print(blob.as_string())

Common Loader Patterns

Text File Loader Pattern

class TextFileLoader(BaseLoader):
    """Load an entire text file as a single Document."""

    def __init__(self, file_path: str, encoding: str = "utf-8"):
        self.file_path = file_path
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        # The whole file becomes one Document; split it downstream if needed.
        with open(self.file_path, 'r', encoding=self.encoding) as handle:
            yield Document(
                page_content=handle.read(),
                metadata={"source": self.file_path},
            )

Directory Loader Pattern

class DirectoryLoader(BaseLoader):
    """Load every file under a directory matching a glob pattern.

    Each matching file becomes one Document whose metadata records the
    file path under the ``source`` key.
    """

    def __init__(self, path: str, glob: str = "**/*", encoding: str = "utf-8"):
        self.path = Path(path)
        self.glob = glob
        # Explicit encoding keeps results consistent across platforms
        # (the bare open() default varies by OS/locale).
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        for file_path in self.path.glob(self.glob):
            # glob patterns can match directories; only files are loadable.
            if file_path.is_file():
                yield Document(
                    page_content=file_path.read_text(encoding=self.encoding),
                    metadata={"source": str(file_path)},
                )

Multi-Document File Loader Pattern

class JSONLinesLoader(BaseLoader):
    """Load JSONL files where each line is a document.

    Each non-empty line must be a JSON object; its ``text`` field becomes
    the page content and its ``metadata`` field (if present) is merged
    into the document metadata.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        with open(self.file_path, 'r', encoding="utf-8") as f:
            for i, line in enumerate(f):
                # Tolerate blank lines — a trailing newline is ubiquitous
                # in JSONL files and would otherwise raise JSONDecodeError.
                if not line.strip():
                    continue
                data = json.loads(line)
                yield Document(
                    page_content=data.get("text", ""),
                    metadata={
                        "source": self.file_path,
                        "line": i,
                        **data.get("metadata", {}),
                    },
                )

Build docs developers (and LLMs) love