The search package provides a full-text search engine with industry-standard BM25 ranking, fuzzy matching, phrase search, and Porter stemming.

Overview

Search features:
  • BM25 Scoring: Industry-standard relevance ranking algorithm
  • Fuzzy Matching: Typo tolerance with Levenshtein distance ≤ 2
  • Phrase Search: Exact phrase matching with quotes ("machine learning")
  • Stemming: Porter stemmer reduces words to roots (running → run)
  • Stop Words: Filters 115+ common English words
  • Trigram Index: Fast fuzzy candidate lookup
  • Tag Filtering: Search within specific tags (tag:architecture)
  • Unicode Support: Full Unicode-aware tokenization

Core Types

Result

type Result struct {
    ID          int     // Post ID in index
    Title       string  // Post title
    Link        string  // Post URL
    Description string  // Short description
    Snippet     string  // Context snippet with highlighting
    Version     string  // Documentation version
    Score       float64 // BM25 relevance score
}

Analyzer

type Analyzer struct {
    useStopWords bool // Filter stop words
    useStemming  bool // Apply Porter stemming
}

ParsedQuery

type ParsedQuery struct {
    Terms   []string // Individual search terms
    Phrases []string // Quoted phrases
    Raw     string   // Original query
}
PerformSearch

func PerformSearch(index *models.SearchIndex, query string, versionFilter string) []Result
Executes a search query against the index. Parameters:
  • index - Pre-built search index
  • query - Search query (supports phrases, tags)
  • versionFilter - Version to filter (or "all")
Example:
import (
    "github.com/Kush-Singh-26/kosh/builder/search"
    "github.com/Kush-Singh-26/kosh/builder/models"
)

// Build index (done once during site build)
index := &models.SearchIndex{
    Posts: posts,
    Inverted: invertedIndex,
    TotalDocs: len(posts),
    AvgDocLen: avgDocLen,
    NgramIndex: ngramIndex,
}

// Perform search
results := search.PerformSearch(index, "architecture", "v4.0")
for _, r := range results {
    fmt.Printf("[%.2f] %s - %s\n", r.Score, r.Title, r.Snippet)
}

Query Syntax

results := search.PerformSearch(index, "markdown parser", "all")
// Searches for posts containing "markdown" OR "parser" (stemmed)
results := search.PerformSearch(index, `"static site generator"`, "all")
// Exact phrase match (no stemming)

Tag Filtering

results := search.PerformSearch(index, "tag:architecture", "all")
// Only posts tagged with "architecture"

results := search.PerformSearch(index, "tag:api optimization", "all")
// Posts tagged "api" containing "optimization"

Version Filtering

results := search.PerformSearch(index, "cache", "v4.0")
// Only search v4.0 documentation

results := search.PerformSearch(index, "cache", "all")
// Search all versions

Text Analysis

Analyzer

func NewAnalyzer(useStopWords, useStemming bool) *Analyzer
Creates a custom analyzer. Example:
// Default analyzer (stop words + stemming)
analyzer := search.DefaultAnalyzer
tokens := analyzer.Analyze("Running quickly through the forest")
// Returns: ["run", "quick", "forest"] (stemmed, stop words removed)

// Custom analyzer (no stemming)
analyzer := search.NewAnalyzer(true, false)
tokens := analyzer.Analyze("Running quickly through the forest")
// Returns: ["running", "quickly", "forest"] (original forms)

Tokenization

func TokenizeWithUnicode(text string) []string
Splits text into tokens with Unicode support. Example:
tokens := search.TokenizeWithUnicode("Hello, 世界! How are you?")
// Returns: ["Hello", "世界", "How", "are", "you"]

Stemming

func StemCached(word string) string
Applies Porter stemming with caching (~76x faster on repeated words). Example:
fmt.Println(search.StemCached("running"))   // "run"
fmt.Println(search.StemCached("flies"))     // "fli"
fmt.Println(search.StemCached("connected")) // "connect"
fmt.Println(search.StemCached("easily"))    // "easili"

Stop Words

func IsStopWord(word string) bool
Checks if a word is a stop word. Example:
fmt.Println(search.IsStopWord("the"))   // true
fmt.Println(search.IsStopWord("cache")) // false
Filtered stop words:
a, an, and, are, as, at, be, but, by, for, if, in, into, is, it, no, not, of, on,
or, such, that, the, their, then, there, these, they, this, to, was, will, with, ...

Fuzzy Matching

Levenshtein Distance

func LevenshteinDistance(a, b string) int
Calculates edit distance between two strings. Example:
fmt.Println(search.LevenshteinDistance("cache", "cach"))   // 1 (1 deletion)
fmt.Println(search.LevenshteinDistance("cache", "cahe"))   // 1 (1 deletion)
fmt.Println(search.LevenshteinDistance("cache", "caxhe"))  // 1 (1 substitution)
fmt.Println(search.LevenshteinDistance("cache", "parse"))  // 3 (3 substitutions)

FuzzyMatch

func FuzzyMatch(term, target string, maxDist int) bool
Checks if two strings match within maxDist edit distance. Example:
fmt.Println(search.FuzzyMatch("trnsformer", "transformer", 2)) // true (1 insertion)
fmt.Println(search.FuzzyMatch("seach", "search", 2))           // true (1 insertion)
fmt.Println(search.FuzzyMatch("xyz", "search", 2))             // false (too different)

Trigram Index

func BuildNgramIndex(inverted map[string]map[int]int) map[string][]string
Builds a trigram index for fast fuzzy lookups (~20% faster). Example:
inverted := map[string]map[int]int{
    "cache": {1: 5, 2: 3},
    "search": {3: 8},
}

ngramIndex := search.BuildNgramIndex(inverted)
// ngramIndex["cac"] = ["cache"]
// ngramIndex["ach"] = ["cache"]
// ngramIndex["che"] = ["cache"]
// ngramIndex["sea"] = ["search"]
// etc.

// Use for fast fuzzy expansion
candidates := search.FuzzyExpandWithNgrams("cach", ngramIndex, 2)
// Returns: ["cache"] (fast candidate lookup)

Phrase Parsing

func ParseQuery(query string) ParsedQuery
Parses a search query into terms and phrases. Example:
parsed := search.ParseQuery(`architecture "static site generator" performance`)

fmt.Println(parsed.Terms)   // ["architectur", "perform"] (stemmed)
fmt.Println(parsed.Phrases) // ["static site generator"] (exact)
fmt.Println(parsed.Raw)     // Original query

Snippet Extraction

func ExtractSnippet(content string, terms []string) string
Extracts a context snippet with search term highlighting. Example:
content := "The cache system uses BoltDB for persistent storage and BLAKE3 for content hashing."
terms := []string{"cache", "blake3"}

snippet := search.ExtractSnippet(content, terms)
// Returns: "...The <b>cache</b> system uses BoltDB for persistent storage and <b>BLAKE3</b> for content hashing."
Configuration:
const (
    MaxSnippetContentLength = 10000 // Max content to process
    DefaultSnippetLength    = 150   // Default snippet length
    SnippetContextBefore    = 60    // Characters before match
    SnippetContextAfter     = 90    // Characters after match
)

BM25 Scoring

BM25 (Best Matching 25) is the industry-standard relevance ranking algorithm.

Algorithm

BM25(q, d) = Σ IDF(qi) × (f(qi, d) × (k1 + 1)) / (f(qi, d) + k1 × (1 - b + b × |d| / avgdl))

Where:
- IDF(qi) = log((N - df + 0.5) / (df + 0.5) + 1)
- f(qi, d) = term frequency in document
- |d| = document length
- avgdl = average document length
- k1 = 1.2 (term saturation parameter)
- b = 0.75 (length normalization parameter)
- N = total documents
- df = document frequency for term

Score Modifiers

const (
    ScorePhraseMatch   = 15.0 // Phrase match boost
    ScoreTitleMatch    = 10.0 // Title match boost
    ScoreTagMatch      = 5.0  // Tag match boost
    ScoreFuzzyModifier = 0.7  // Fuzzy match penalty
)
Example scoring:
Term match:        BM25 score (0.0 - 10.0)
Phrase match:      +15.0
Title match:       +10.0
Tag match:         +5.0
Fuzzy match:       ×0.7 penalty

Performance Optimization

Stemming Cache

var stemCache sync.Map // Thread-safe cache

func StemCached(word string) string {
    if cached, ok := stemCache.Load(word); ok {
        return cached.(string)
    }
    stemmed := stem(word)
    stemCache.Store(word, stemmed)
    return stemmed
}
~76x speedup on repeated words.

Replacer Cache

var (
    replacerCache   = make(map[string]*strings.Replacer)
    replacerCacheMu sync.RWMutex
)

func getReplacer(term string) *strings.Replacer {
    replacerCacheMu.RLock()
    if r, ok := replacerCache[term]; ok {
        replacerCacheMu.RUnlock()
        return r
    }
    replacerCacheMu.RUnlock()
    // Build and cache replacer
}
Avoids creating new replacers for snippet highlighting.

Post Cache

postCache := make(map[int]*models.PostRecord, maxResults)

for postID, freq := range posts {
    post, cached := postCache[postID]
    if !cached {
        post = &index.Posts[postID]
        postCache[postID] = post
    }
    // Use cached post
}
Reduces repeated array lookups during scoring.

Building a Search Index

import (
    "github.com/Kush-Singh-26/kosh/builder/search"
    "github.com/Kush-Singh-26/kosh/builder/models"
)

// Create inverted index
inverted := make(map[string]map[int]int)
analyzer := search.DefaultAnalyzer

for i, post := range posts {
    tokens := analyzer.Analyze(post.Content)
    for _, token := range tokens {
        if inverted[token] == nil {
            inverted[token] = make(map[int]int)
        }
        inverted[token][i]++ // Term frequency
    }
}

// Build trigram index
ngramIndex := search.BuildNgramIndex(inverted)

// Create search index
index := &models.SearchIndex{
    Posts: posts,
    Inverted: inverted,
    NgramIndex: ngramIndex,
    TotalDocs: len(posts),
    AvgDocLen: calculateAvgDocLen(posts),
}

WASM Integration

The search engine compiles to WebAssembly for browser-side execution:
GOOS=js GOARCH=wasm go build -o search.wasm ./cmd/search
JavaScript usage:
const go = new Go();
WebAssembly.instantiateStreaming(fetch("search.wasm"), go.importObject)
    .then((result) => {
        go.run(result.instance);
        
        // Search function exposed from Go
        const results = window.performSearch("architecture", "all");
        console.log(results);
    });
Related Packages

  • cache - Search record storage
  • models - SearchIndex structures
  • parser - Plain text extraction

Build docs developers (and LLMs) love