The Arcana.Chunker behaviour allows you to implement custom text chunking strategies. Arcana ships with a default implementation using the text_chunker library, but you can create chunkers that split text based on semantic boundaries, document structure, or any custom logic.
Required Callbacks
chunk/2
Splits text into chunks suitable for embedding and retrieval.
The text to split into chunks
Options passed from configuration or at call time. Common options:
:chunk_size - Maximum chunk size (default varies by implementation)
:chunk_overlap - Overlap between consecutive chunks
:format - Text format hint (:plaintext, :markdown, :elixir, etc.)
:size_unit - How to measure size: :characters or :tokens
Returns: A list of chunk maps, each containing:
:text - The chunk text content (required)
:chunk_index - Zero-based index of this chunk (required)
:token_count - Estimated token count (required)
- Additional keys may be included and will be passed through to storage
Configuration
Configure your chunker in config/config.exs:
# Built-in: Default text chunker
config :arcana, chunker: :default
config :arcana, chunker: {:default, chunk_size: 512, chunk_overlap: 100}
# Custom module
config :arcana, chunker: MyApp.SemanticChunker
config :arcana, chunker: {MyApp.SemanticChunker, model: "..."}
# Custom function
config :arcana, chunker: fn text, _opts ->
  [%{text: text, chunk_index: 0, token_count: 10}]
end
Implementation Examples
Semantic Chunking
Split text based on semantic boundaries using sentence embeddings:
defmodule MyApp.SemanticChunker do
  @moduledoc """
  Example chunker that groups sentences by semantic similarity.

  Implements the `Arcana.Chunker` behaviour: sentences are embedded with a
  lightweight sentence embedder and consecutive, similar sentences are merged
  until a token budget is reached.
  """
  @behaviour Arcana.Chunker

  @impl true
  def chunk(text, opts) do
    token_budget = Keyword.get(opts, :chunk_size, 450)

    sentences = split_sentences(text)
    vectors = Enum.map(sentences, &embed_sentence/1)

    sentences
    |> group_by_similarity(vectors, token_budget)
    |> Enum.with_index()
    |> Enum.map(&to_chunk_map/1)
  end

  # Builds the required chunk map (:text, :chunk_index, :token_count)
  # for one grouped span of text.
  defp to_chunk_map({grouped_text, index}) do
    %{
      text: grouped_text,
      chunk_index: index,
      token_count: estimate_tokens(grouped_text)
    }
  end

  # Naive splitter: break on terminal punctuation followed by whitespace.
  # A real implementation might use an NLP library here.
  defp split_sentences(text) do
    text
    |> String.split(~r/[.!?]\s+/)
    |> Enum.reject(&(&1 == ""))
  end

  # Embeds one sentence; asserts success — a failed embed is a bug here.
  defp embed_sentence(sentence) do
    {:ok, embedding} = MyApp.SentenceEmbedder.embed(sentence)
    embedding
  end

  # Groups consecutive sentences that are semantically similar while the
  # running size stays under max_tokens.
  defp group_by_similarity(sentences, embeddings, max_tokens) do
    # ... implementation details ...
  end

  # Rough heuristic: ~4 characters per token for English text.
  defp estimate_tokens(text) do
    max(1, div(String.length(text), 4))
  end
end
Document Structure Chunking
Split Markdown based on document structure:
defmodule MyApp.MarkdownChunker do
  @moduledoc """
  Example chunker that splits Markdown into sections on header boundaries.

  Sections larger than the `:chunk_size` token budget are split further
  (with `:chunk_overlap` tokens of overlap between pieces).
  """
  @behaviour Arcana.Chunker

  @impl true
  def chunk(text, opts) do
    max_tokens = Keyword.get(opts, :chunk_size, 450)
    overlap = Keyword.get(opts, :chunk_overlap, 50)

    # Parse markdown into sections by headers
    sections = parse_markdown_sections(text)

    # Split any section that exceeds the token budget
    chunks =
      Enum.flat_map(sections, fn section ->
        if section.token_count > max_tokens do
          split_section(section, max_tokens, overlap)
        else
          [section]
        end
      end)

    # Assign final zero-based chunk indices
    chunks
    |> Enum.with_index()
    |> Enum.map(fn {chunk, index} ->
      Map.put(chunk, :chunk_index, index)
    end)
  end

  defp parse_markdown_sections(text) do
    # Split by markdown headers (#, ##, ... ######).
    # NOTE: the `#` must be escaped inside ~r// — a bare `#{1,6}` is Elixir
    # string interpolation, not a regex quantifier, and fails to compile.
    text
    |> String.split(~r/^\#{1,6}\s+/m)
    |> Enum.reject(&(&1 == ""))
    |> Enum.map(fn section_text ->
      trimmed = String.trim(section_text)

      %{
        text: trimmed,
        # Estimate on the trimmed text so :token_count matches :text exactly
        # (the original estimated on the untrimmed section).
        token_count: estimate_tokens(trimmed),
        chunk_index: 0 # Will be updated later
      }
    end)
  end

  defp split_section(section, max_tokens, overlap) do
    # Split large sections into smaller chunks with overlap
    # ... implementation details ...
  end

  defp estimate_tokens(text) do
    # Rough estimate: ~4 chars per token for English
    max(1, div(String.length(text), 4))
  end
end
Keyword-Based Chunking
Split based on topic keywords:
defmodule MyApp.KeywordChunker do
  @moduledoc """
  Example chunker that splits text on topic-keyword boundaries.

  When `:keywords` is empty it falls back to simple fixed-size splitting.
  Each chunk map carries an extra `:keywords` key listing the keywords
  found in that chunk.
  """
  @behaviour Arcana.Chunker

  @impl true
  def chunk(text, opts) do
    keywords = Keyword.get(opts, :keywords, [])
    max_tokens = Keyword.get(opts, :chunk_size, 450)

    chunks =
      if Enum.empty?(keywords) do
        # Fall back to simple fixed-size splitting
        simple_chunk(text, max_tokens)
      else
        # Split on keyword occurrences
        split_on_keywords(text, keywords, max_tokens)
      end

    # Add the required keys plus :keywords metadata
    chunks
    |> Enum.with_index()
    |> Enum.map(fn {chunk_text, index} ->
      %{
        text: chunk_text,
        chunk_index: index,
        token_count: estimate_tokens(chunk_text),
        keywords: extract_keywords(chunk_text, keywords)
      }
    end)
  end

  # Fallback splitter: fixed-size character windows using the ~4 chars/token
  # heuristic. (The original example called this helper without defining it,
  # which is a compile error.)
  defp simple_chunk("", _max_tokens), do: []

  defp simple_chunk(text, max_tokens) do
    window = max(1, max_tokens * 4)

    text
    |> String.graphemes()
    |> Enum.chunk_every(window)
    |> Enum.map(&Enum.join/1)
  end

  defp split_on_keywords(text, keywords, max_tokens) do
    # Split text where keywords appear
    # ... implementation details ...
  end

  # Keywords present in the chunk, matched case-insensitively.
  defp extract_keywords(text, keywords) do
    Enum.filter(keywords, fn keyword ->
      String.contains?(String.downcase(text), String.downcase(keyword))
    end)
  end

  defp estimate_tokens(text) do
    # Rough estimate: ~4 chars per token for English
    max(1, div(String.length(text), 4))
  end
end
Built-in Implementation: Default Chunker
The built-in default chunker uses the text_chunker library:
defmodule Arcana.Chunker.Default do
  @moduledoc """
  Built-in default chunker backed by the `text_chunker` library.

  Supports `:chunk_size`, `:chunk_overlap`, `:format`, and `:size_unit`
  (`:tokens` or `:characters`) options.
  """
  @behaviour Arcana.Chunker

  @default_chunk_size 450
  @default_chunk_overlap 50
  @default_format :plaintext
  @default_size_unit :tokens

  @impl true
  def chunk(text, opts \\ [])

  # Empty input yields no chunks.
  def chunk("", _opts), do: []

  def chunk(text, opts) do
    chunk_size = Keyword.get(opts, :chunk_size, @default_chunk_size)
    chunk_overlap = Keyword.get(opts, :chunk_overlap, @default_chunk_overlap)
    format = Keyword.get(opts, :format, @default_format)
    size_unit = Keyword.get(opts, :size_unit, @default_size_unit)

    # TextChunker measures size in characters; scale token budgets using the
    # ~4 chars/token heuristic.
    {effective_chunk_size, effective_overlap} =
      case size_unit do
        :tokens -> {chunk_size * 4, chunk_overlap * 4}
        :characters -> {chunk_size, chunk_overlap}
      end

    text_chunker_opts = [
      chunk_size: effective_chunk_size,
      chunk_overlap: effective_overlap,
      format: format
    ]

    text
    |> TextChunker.split(text_chunker_opts)
    |> Enum.map(& &1.text)
    |> Enum.reject(&blank?/1)
    |> Enum.with_index()
    |> Enum.map(fn {chunk_text, index} ->
      %{
        text: chunk_text,
        chunk_index: index,
        token_count: estimate_tokens(chunk_text)
      }
    end)
  end

  # Drops whitespace-only chunks. (Missing from the original example, which
  # referenced blank?/1 without defining it — a compile error.)
  defp blank?(text), do: String.trim(text) == ""

  defp estimate_tokens(text) do
    # Rough estimate: ~4 chars per token for English
    max(1, div(String.length(text), 4))
  end
end
Usage in Code
You can override the configured chunker at call time:
# Use global config
Arcana.ingest(text, repo: MyApp.Repo)
# Override with custom chunker
Arcana.ingest(text,
repo: MyApp.Repo,
chunker: {MyApp.SemanticChunker, model: "sentence-transformers"}
)
# Override with custom options
Arcana.ingest(markdown_text,
repo: MyApp.Repo,
chunker: {:default, chunk_size: 512, format: :markdown}
)
Every chunk map must include:
:text (String.t()) - The chunk text content
:chunk_index (integer) - Zero-based index (0, 1, 2, …)
:token_count (integer) - Estimated token count
You can add additional keys for metadata:
%{
text: "chunk content",
chunk_index: 0,
token_count: 45,
# Additional metadata
section: "Introduction",
keywords: ["elixir", "programming"],
importance: 0.8
}
This metadata is stored alongside the chunk and returned in search results.
See Also