Skip to main content
The cache package provides a compiler-grade incremental build system using BoltDB for metadata and a content-addressed filesystem store for large artifacts.

Overview

Architecture:
  • BoltDB: Stores metadata (PostMeta, SearchRecord, Dependencies)
  • Content Store: Content-addressed filesystem with two-tier sharding
  • In-Memory LRU: Hot PostMeta data cached with 5-minute TTL
  • BLAKE3 Hashing: Cryptographic content hashing
  • Body Hash Tracking: Separate frontmatter and body hashes for accurate invalidation
  • SSR Artifact Caching: D2 diagrams and LaTeX math pre-rendered and cached
  • Zstd Compression: Fast compression for large artifacts

Opening a Cache

func Open(basePath string, isDev bool) (*Manager, error)
Parameters:
  • basePath - Cache directory (e.g., .kosh-cache)
  • isDev - Enable dev mode (faster, less durable)
Example:
import "github.com/Kush-Singh-26/kosh/builder/cache"

cache, err := cache.Open(".kosh-cache", false)
if err != nil {
    log.Fatal(err)
}
defer cache.Close()

// Verify cache ID (detects config changes)
needsRebuild, err := cache.VerifyCacheID(configHash)
if needsRebuild {
    log.Println("Cache invalidated - full rebuild required")
    cache.SetCacheID(configHash)
}

Core Types

Manager

// Manager ties together the BoltDB metadata database, the content-addressed
// store and the in-memory cache of hot PostMeta entries.
type Manager struct {
    // Persistent storage.
    db       *bolt.DB // handle to the BoltDB database
    store    *Store   // content-addressed artifact store
    basePath string   // directory the cache lives in
    cacheID  string   // config hash; used for invalidation

    // Dirty-post tracking.
    mu    sync.RWMutex
    dirty map[string]bool // posts currently marked dirty

    // Hot PostMeta entries kept in memory.
    memCache    map[string]*memoryCacheEntry
    memCacheMu  sync.RWMutex
    memCacheTTL time.Duration // defaults to 5 minutes
}

PostMeta

// PostMeta is the cached metadata record for a single post.
type PostMeta struct {
    // Identity and source file.
    PostID  string // stable identifier of the post
    Path    string // path of the source file
    ModTime int64  // last modification time of the source

    // Hashes and cached HTML.
    ContentHash    string   // BLAKE3 hash of the frontmatter
    BodyHash       string   // BLAKE3 hash of the body content
    HTMLHash       string   // hash of large HTML content
    InlineHTML     []byte   // small HTML (< 32KB) kept inline
    TemplateHash   string   // template hash, used for invalidation
    SSRInputHashes []string // input hashes of D2/LaTeX blocks

    // Post attributes.
    Title       string                 // title of the post
    Date        time.Time              // publication date
    Tags        []string               // tags attached to the post
    WordCount   int                    // number of words
    ReadingTime int                    // estimated reading time, in minutes
    Description string                 // short description
    Link        string                 // URL of the post
    Weight      int                    // sort weight
    Pinned      bool                   // whether the post is pinned to the top
    Draft       bool                   // whether the post is a draft
    Meta        map[string]interface{} // raw frontmatter
    TOC         []models.TOCEntry      // table of contents
    Version     string                 // documentation version
}

SearchRecord

// SearchRecord holds the pre-computed search data for one post.
type SearchRecord struct {
    // Title data.
    Title           string // title of the post
    NormalizedTitle string // title in lowercase

    // Index data.
    Tokens   []string       // stemmed tokens
    BM25Data map[string]int // frequency per word
    DocLen   int            // length of the document, in tokens

    // Text and tags.
    Content        string   // content as plain text
    NormalizedTags []string // tags in lowercase
    Words          []string // tokenized words, cached
}

SSRArtifact

// SSRArtifact describes one cached pre-rendered artifact.
type SSRArtifact struct {
    // What was rendered.
    Type       string // one of "d2", "katex"
    InputHash  string // BLAKE3 hash of the input content
    OutputHash string // BLAKE3 hash of the output content

    // Bookkeeping.
    RefCount   int   // number of references; used by GC
    Size       int64 // size of the artifact
    CreatedAt  int64 // timestamp of creation
    Compressed bool  // set when the artifact is Zstd-compressed
}

Reading from Cache

GetPostByPath

func (m *Manager) GetPostByPath(path string) (*PostMeta, error)
Looks up a post by file path (checks in-memory cache first). Example:
post, err := cache.GetPostByPath("content/v4.0/architecture.md")
if err != nil {
    return err
}

if post == nil {
    // Cache miss - need to render
    return renderPost(path)
}

// Check if file changed
info, _ := os.Stat(path)
if info.ModTime().Unix() != post.ModTime {
    // File modified - re-render
    return renderPost(path)
}

// Use cached HTML
html, _ := cache.GetHTMLContent(post)

GetPostByID

func (m *Manager) GetPostByID(postID string) (*PostMeta, error)
Retrieves a post by its PostID. Example:
// Derive the PostID first, then use it as the lookup key.
postID := cache.GeneratePostID(uuid, normalizedPath)
post, err := cache.GetPostByID(postID)

GetHTMLContent

func (m *Manager) GetHTMLContent(post *PostMeta) ([]byte, error)
Retrieves HTML content (inline or from store). Example:
// Fetch the HTML for an already-loaded post and report its size in bytes.
html, err := cache.GetHTMLContent(post)
if err != nil {
    return err
}
fmt.Println("HTML size:", len(html))

GetSearchRecord

func (m *Manager) GetSearchRecord(postID string) (*SearchRecord, error)
Retrieves pre-computed search data.

GetSSRArtifact

func (m *Manager) GetSSRArtifact(ssrType, inputHash string) (*SSRArtifact, error)
Retrieves cached D2/KaTeX artifacts. Example:
// A failed lookup or read is treated like a cache miss: the diagram is simply
// rendered again below.
artifact, err := cache.GetSSRArtifact("d2", inputHash)
if err == nil && artifact != nil {
    // Use cached SVG
    if svg, err := cache.GetSSRContent("d2", artifact); err == nil {
        return string(svg)
    }
}

// Render D2 diagram
svg := renderD2(diagramCode)
if err := cache.StoreSSRArtifact("d2", inputHash, []byte(svg)); err != nil {
    // Failing to cache does not invalidate the freshly rendered SVG.
    log.Printf("caching d2 artifact: %v", err)
}

Writing to Cache

StorePost

func (m *Manager) StorePost(meta *PostMeta) error
Stores post metadata in cache. Example:
// Assemble the metadata record for the post.
meta := &cache.PostMeta{
    PostID:      postID,
    Path:        "content/architecture.md",
    ModTime:     time.Now().Unix(),
    ContentHash: cache.HashString(frontmatter),
    BodyHash:    cache.HashContent([]byte(body)),
    Title:       "Architecture Overview",
    InlineHTML:  html, // only when smaller than 32KB
    Tags:        []string{"architecture", "design"},
    Version:     "v4.0",
}

// Persist it; a failure is handed back to the caller.
if err := cache.StorePost(meta); err != nil {
    return err
}

BatchCommit

func (m *Manager) BatchCommit(posts []EncodedPost) error
Batch commit multiple posts for better performance. Example:
var encodedPosts []cache.EncodedPost

for _, post := range posts {
    metaData, _ := cache.Encode(post.Meta)
    searchData, _ := cache.Encode(post.Search)
    
    encodedPosts = append(encodedPosts, cache.EncodedPost{
        PostID: []byte(post.ID),
        Data: metaData,
        SearchData: searchData,
        Tags: post.Tags,
    })
}

cache.BatchCommit(encodedPosts)

StoreSSRArtifact

func (m *Manager) StoreSSRArtifact(ssrType, inputHash string, content []byte) error
Stores rendered D2 diagrams or LaTeX math. Example:
// Render once, then cache the result under the hash of its input.
svg := renderD2Diagram(code)
if err := cache.StoreSSRArtifact("d2", inputHash, []byte(svg)); err != nil {
    return err
}

Content-Addressed Store

Store Type

// Store is the content-addressed filesystem store; it carries its base
// directory and the zstd encoder/decoder pair.
type Store struct {
    basePath string // directory the store is rooted at

    encoder *zstd.Encoder // zstd encoder
    decoder *zstd.Decoder // zstd decoder
}

Put

func (s *Store) Put(category string, content []byte) (hash string, ct CompressionType, err error)
Stores content and returns its hash. Compression strategy:
  • < 1KB: No compression (raw)
  • 1KB - 10KB: Fast zstd compression
  • > 10KB: Level 3 zstd compression
Example:
// Put returns the content hash and the compression type that was applied.
hash, compression, err := cache.Store().Put("html", htmlContent)
if err != nil {
    return err
}
fmt.Printf("Stored at hash %s with compression %v\n", hash, compression)

Get

func (s *Store) Get(category string, hash string, compressed bool) ([]byte, error)
Retrieves content by hash. Example:
// Arguments follow the signature: category, hash, compressed flag (true here).
content, err := cache.Store().Get("html", hash, true)

Sharding

Content is stored with two-tier sharding:
store/
  └── html/
      └── ab/
          └── cd/
              └── abcdef1234567890.zst
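The first two characters of the hash name the first shard directory and the next two name the second: hash abcdef1234567890 in category html → store/html/ab/cd/abcdef1234567890.zst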

Cache Invalidation

Body Hash Tracking (Critical)

Posts track two hashes:
  1. ContentHash - Frontmatter hash (title, tags, date)
  2. BodyHash - Body content hash (markdown text)
Why separate hashes? If only ContentHash were compared, a change to the body that leaves the frontmatter untouched would produce a silent cache hit and serve stale HTML. Example invalidation logic:
// shouldRebuild reports whether a post has to be re-rendered.
//
// It returns true when there is no cached entry, when the hash of
// newFrontmatter differs from cached.ContentHash, or when the hash of newBody
// differs from cached.BodyHash. It returns false only when both hashes match.
//
// NOTE(review): PostMeta is unqualified while the hash helpers are
// package-qualified; confirm which package this helper is meant to live in.
func shouldRebuild(cached *PostMeta, newFrontmatter, newBody string) bool {
    // The lookup functions hand back nil on a cache miss; with nothing cached
    // a rebuild is always required (and dereferencing nil would panic).
    if cached == nil {
        return true
    }

    if cached.ContentHash != cache.HashString(newFrontmatter) {
        return true // Frontmatter changed
    }

    // Frontmatter is unchanged, so the body hash alone decides.
    return cached.BodyHash != cache.HashContent([]byte(newBody))
}

SSR Input Hashes

Tracks hashes of all D2 diagrams and LaTeX expressions:
// Placeholder values: one input hash per D2 diagram or LaTeX expression in the post.
meta.SSRInputHashes = []string{
    "blake3_hash_of_diagram_1",
    "blake3_hash_of_math_expr_1",
    "blake3_hash_of_math_expr_2",
}
Used for garbage collection and cache validation.

In-Memory LRU Cache

Architecture

// memoryCacheEntry pairs a cached post with the time at which it expires.
type memoryCacheEntry struct {
    meta      *PostMeta // the cached post
    expiresAt time.Time // when the entry expires
}

// defaultMemCacheTTL is the default entry lifetime: 5 minutes.
const defaultMemCacheTTL = 5 * time.Minute

Benefits

  • Faster lookups: Avoid BoltDB read for hot posts
  • Automatic expiration: Stale entries cleaned up
  • Thread-safe: Protected by memCacheMu
Example usage:
// Errors are deliberately ignored here; the snippet only illustrates timing.

// First lookup - BoltDB read
post, _ := cache.GetPostByPath("content/architecture.md") // ~500µs

// Subsequent lookups - memory cache hit.
// Plain assignment: post already exists, so a second := would not compile.
post, _ = cache.GetPostByPath("content/architecture.md") // ~50µs (10x faster)

Garbage Collection

MarkDirty / ClearDirty

func (m *Manager) MarkDirty(postID string)
func (m *Manager) ClearDirty(postID string)
Tracks which posts have been updated.

RunGC

func (m *Manager) RunGC(dryRun bool) (*GCStats, error)
Removes unreferenced SSR artifacts. Example:
// Dry run - see what would be deleted
stats, err := cache.RunGC(true)
if err != nil {
    return err
}
fmt.Printf("Would delete %d artifacts (%d bytes)\n", stats.Deleted, stats.BytesFreed)

// Actually delete.
// Plain assignment: stats and err already exist, so := would not compile.
stats, err = cache.RunGC(false)
if err != nil {
    return err
}

Cache Statistics

// CacheStats is the statistics snapshot for the cache.
type CacheStats struct {
    // Totals.
    TotalPosts int   // number of posts in the cache
    TotalSSR   int   // number of SSR artifacts
    StoreBytes int64 // size of the content store

    // History and schema.
    LastGC        int64 // timestamp of the last GC
    BuildCount    int   // number of builds so far
    SchemaVersion int   // version of the cache schema

    // HTML storage breakdown.
    InlinePosts int // posts whose HTML is inline
    HashedPosts int // posts whose HTML is hashed
}
Usage:
// Check the error before reading any field of stats.
stats, err := cache.GetStats()
if err != nil {
    return err
}
// StoreBytes is divided by 1e6 to print megabytes.
fmt.Printf("Cache: %d posts, %d SSR artifacts, %.2f MB\n",
    stats.TotalPosts, stats.TotalSSR, float64(stats.StoreBytes)/1e6)

Hashing Functions

HashContent

func HashContent(data []byte) string
Computes BLAKE3 hash of byte data. Example:
// The BLAKE3 hash comes back as a string.
hash := cache.HashContent([]byte("hello world"))
// Returns: "d74981efa70a0c880b8d8c1985d075dbcbf679b99a5f9914e5aaf96b831a9e24"

HashString

func HashString(s string) string
Computes BLAKE3 hash of string.

GeneratePostID

func GeneratePostID(uuid string, normalizedPath string) string
Generates a stable PostID from the frontmatter UUID when one is given, falling back to the normalized path otherwise. Example:
// From UUID (if specified in frontmatter)
postID := cache.GeneratePostID("550e8400-e29b-41d4-a716-446655440000", "")

// From path (fallback).
// Plain assignment: postID already exists, so a second := would not compile.
postID = cache.GeneratePostID("", "content/architecture.md")

Performance Optimization

Object Pooling

// encodedPostPool recycles []EncodedPost scratch slices. New supplies an empty
// slice with capacity 64 when the pool has nothing to hand out.
var encodedPostPool = sync.Pool{
    New: func() interface{} {
        return make([]EncodedPost, 0, 64)
    },
}

// Borrow a slice from the pool.
posts := encodedPostPool.Get().([]EncodedPost)
// The argument posts[:0] is evaluated when this defer statement executes, so
// the slice returned to the pool is the one obtained from Get above.
// NOTE(review): if posts is later grown past its capacity by append, the grown
// backing array is not what gets pooled; and putting a slice value (rather
// than a pointer) into the pool boxes it on each Put — confirm both are intended.
defer encodedPostPool.Put(posts[:0])
Reduces allocations in hot paths.

Batch Operations

Group BoltDB writes into single transaction:
// BatchCommit handles every post inside a single m.db.Update call, so all
// writes share one transaction. The per-post writes are elided in this
// excerpt; as shown, the callback always returns nil.
func (m *Manager) BatchCommit(posts []EncodedPost) error {
    return m.db.Update(func(tx *bolt.Tx) error {
        for _, post := range posts {
            // All writes in single transaction
        }
        return nil
    })
}
~10x faster than individual commits.

Compression Pooling

// level3EncoderPool hands out reusable zstd encoders created with the
// zstd.SpeedDefault encoder level.
var level3EncoderPool = sync.Pool{
    New: func() interface{} {
        // NOTE(review): the error from zstd.NewWriter is discarded — confirm
        // it cannot fail for these options, otherwise a nil encoder is pooled.
        enc, _ := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedDefault))
        return enc
    },
}
Reuses zstd encoders to avoid allocation overhead.

Related Packages

  • parser - Markdown parsing for cache keys
  • renderer - Template hashing
  • search - Search record storage

Build docs developers (and LLMs) love