Skip to main content
Nanahoshi provides powerful full-text search capabilities using Elasticsearch. The search system is optimized for Japanese text with the Sudachi analyzer and supports multi-field matching with dynamic boosting.

Elasticsearch configuration

The search index uses a custom Elasticsearch schema with Japanese language analysis:
packages/api/src/infrastructure/search/elasticsearch/search.client.ts
import { Client, HttpConnection } from "@elastic/elasticsearch";
import { env } from "@nanahoshi-v2/env/server";

// Shared Elasticsearch client for the whole API process.
// Uses the classic HttpConnection transport rather than the client default.
export const esClient = new Client({
  node: env.ELASTICSEARCH_NODE,
  Connection: HttpConnection,
});

// Physical index name, namespaced per environment via the configured prefix
// (e.g. "<prefix>_books").
const INDEX_NAME = `${env.ELASTICSEARCH_INDEX_PREFIX}_books`;

Index schema

The schema is defined in search.schema.json with custom analyzers:
packages/api/src/infrastructure/search/elasticsearch/search.schema.json
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "ja_sudachi_tokenizer": {
          "type": "sudachi_tokenizer",
          "split_mode": "C",
          "discard_punctuation": true
        }
      },
      "filter": {
        "sudachi_baseform": { "type": "sudachi_baseform" },
        "sudachi_normalizeform": { "type": "sudachi_normalizedform" },
        "sudachi_reading_katakana": {
          "type": "sudachi_readingform",
          "reading_form_type": "surface"
        },
        "katakana_to_hiragana": {
          "type": "icu_transform",
          "id": "Katakana-Hiragana"
        }
      },
      "char_filter": {
        "romaji_to_kana": {
          "type": "mapping",
          "mappings": [
            "ka=>か", "ki=>き", "ku=>く",
            "sha=>しゃ", "chi=>ち", "tsu=>つ"
            // ... (200+ romaji mappings)
          ]
        }
      },
      "analyzer": {
        "ja_surface_index_analyzer": {
          "type": "custom",
          "char_filter": ["normalize"],
          "tokenizer": "ja_sudachi_tokenizer",
          "filter": ["sudachi_pos_filter", "sudachi_split_search", "lowercase"]
        },
        "ja_kana_index_analyzer": {
          "type": "custom",
          "char_filter": ["normalize", "romaji_to_kana"],
          "tokenizer": "ja_sudachi_tokenizer",
          "filter": [
            "sudachi_pos_filter",
            "sudachi_reading_katakana",
            "katakana_to_hiragana",
            "sudachi_split_search",
            "lowercase"
          ]
        }
      }
    }
  }
}

Field mappings

Each text field has multiple sub-fields for different matching strategies:
"title": {
  "type": "text",
  "analyzer": "ja_surface_index_analyzer",
  "search_analyzer": "ja_surface_search_analyzer",
  "fields": {
    "keyword": { "type": "keyword" },
    "baseform": {
      "type": "text",
      "analyzer": "ja_baseform_index_analyzer"
    },
    "normalized": {
      "type": "text",
      "analyzer": "ja_normalized_index_analyzer"
    },
    "kana": {
      "type": "text",
      "analyzer": "ja_kana_index_analyzer"
    }
  }
}
Authors use nested objects to enable proper highlighting:
"authors": {
  "type": "nested",
  "properties": {
    "name": {
      "type": "text",
      "analyzer": "ja_surface_index_analyzer",
      "fields": {
        "keyword": { "type": "keyword" },
        "baseform": { ... },
        "normalized": { ... },
        "kana": { ... }
      }
    },
    "role": { "type": "keyword" }
  }
}

Japanese language support

Sudachi tokenizer

Nanahoshi uses the Sudachi tokenizer (not Kuromoji) for superior Japanese text analysis:

Better compound handling

Sudachi excels at splitting compound words and proper nouns correctly

Multiple tokenization modes

Mode C (longest) provides comprehensive coverage for search

Dictionary normalization

Built-in dictionary for baseform and normalized form variants

Reading forms

Extracts katakana readings for kana-based search

Script detection

The search query builder detects the input script and adjusts field boosting:
packages/api/src/infrastructure/search/elasticsearch/search.query.ts
// Character classes: CJK Unified Ideographs + Extension A (kanji),
// Hiragana + Katakana (kana).
const KANJI_REGEX = /[\u4E00-\u9FFF\u3400-\u4DBF]/;
const KANA_REGEX = /[\u3040-\u309F\u30A0-\u30FF]/;

/**
 * Classifies a query by the highest-priority script it contains:
 * any kanji character wins, then any kana character; everything else
 * (Latin letters, digits, symbols) is treated as romaji.
 */
function detectInputScript(query: string): InputScript {
  if (KANJI_REGEX.test(query)) {
    return "kanji";
  }
  return KANA_REGEX.test(query) ? "kana" : "romaji";
}

Dynamic field boosting

Boost values change based on detected script:
packages/api/src/infrastructure/search/elasticsearch/search.query.ts
// Per-script field boost tables. A boost of 0 removes the field from the
// query entirely (buildTextQuery filters on boost > 0). Fields absent from a
// table are likewise skipped.
const BOOSTS: Record<InputScript, Record<string, number>> = {
  kanji: {
    title: 10,
    "title.baseform": 5,
    "title.normalized": 4,
    "title.kana": 0,           // Skip kana fields for kanji input
    "authors.name": 8,
    description: 2,
    titleRomaji: 0,              // Skip romaji for kanji input
  },
  kana: {
    title: 10,
    "title.baseform": 5,
    "title.kana": 3,             // Enable kana fields
    "authors.name.kana": 3,
    description: 2,
  },
  romaji: {
    title: 3,                    // Lower boost for surface form
    "title.kana": 5,             // Prefer kana fields (after romaji->kana conversion)
    "authors.name.kana": 5,
    titleRomaji: 8,              // Highest boost for dedicated romaji field
  },
};
Romaji queries use the romaji_to_kana char filter to convert input like “toukyou” to “とうきょう” before analysis.

Search API

The search client provides a type-safe interface:
packages/api/src/infrastructure/search/elasticsearch/search.types.ts
/** Request payload for the book search endpoint. */
export interface SearchBooksRequest {
  query?: string;              // Free-text query; omit for filter-only browsing
  exactMatch?: boolean;        // Phrase match instead of term-by-term matching
  limit?: number;              // Max 50
  cursor?: string;             // For pagination
  sort?: SearchSort;
  filters?: {
    languageCode?: string[];
    publishedDateRange?: { from?: string; to?: string };  // NOTE(review): presumably ISO date strings — confirm
    pageCountRange?: { min?: number; max?: number };
    authors?: string[];
    series?: string[];
    publishers?: string[];
  };
}

/** Sort orders; "relevance" is score-based, the rest are field sorts. */
export type SearchSort = "relevance" | "newest" | "oldest" | "title_asc" | "title_desc";

/** Response payload: hits plus cursor-based pagination metadata. */
export interface SearchBooksResponse {
  books: SearchBookHit[];
  pagination: {
    cursor?: string;           // Feed back as SearchBooksRequest.cursor for the next page
    hasMore: boolean;
    totalHits: number;
    totalHitsRelation: "eq" | "gte";  // "gte" when the engine reports a lower bound, not an exact count
  };
}

Building search queries

packages/api/src/infrastructure/search/elasticsearch/search.query.ts
/**
 * Builds the free-text portion of a book search.
 *
 * Combines, via dis_max (tie_breaker 0.1):
 *  1. a simple_query_string over top-level text fields whose boost for the
 *     detected script is > 0, and
 *  2. a nested simple_query_string over `authors`, with inner_hits so
 *     author-name matches can be highlighted.
 *
 * @param queryText  Raw user query text.
 * @param exactMatch When true, the query is wrapped in double quotes so
 *                   simple_query_string performs a phrase match.
 * @param script     Detected input script; selects the boost table.
 */
function buildTextQuery(
  queryText: string,
  exactMatch: boolean,
  script: InputScript,
): QueryDslQueryContainer {
  const boosts = BOOSTS[script];
  // Fix: escape embedded double quotes before wrapping. An unescaped `"`
  // inside user input would otherwise terminate the phrase early and change
  // the meaning of the simple_query_string expression.
  const effectiveQuery = exactMatch
    ? `"${queryText.replace(/"/g, '\\"')}"`
    : queryText;

  // Top-level fields query: drop fields with a zero/absent boost for this
  // script (e.g. kana fields for kanji input) and encode boosts as `field^N`.
  const topLevelFields = TOP_LEVEL_TEXT_FIELDS
    .filter((f) => (boosts[f] ?? 0) > 0)
    .map((f) => `${f}^${boosts[f]}`);

  const topLevelQuery: QueryDslQueryContainer = {
    simple_query_string: {
      query: effectiveQuery,
      fields: topLevelFields,
      default_operator: "and",
      analyze_wildcard: true,
    },
  };

  // Nested author query with inner_hits for highlighting
  const nestedAuthorQuery: QueryDslQueryContainer = {
    nested: {
      path: "authors",
      query: {
        simple_query_string: {
          query: effectiveQuery,
          fields: authorFields,
          default_operator: "and",
        },
      },
      inner_hits: {
        _source: false,
        highlight: {
          fields: {
            "authors.name": {
              type: "unified",
              number_of_fragments: 1,
              pre_tags: ["<em>"],
              post_tags: ["</em>"],
            },
          },
        },
      },
    },
  };

  // dis_max scores by the best-matching sub-query; the small tie_breaker
  // nudges documents that match both title and author slightly higher.
  return {
    dis_max: {
      queries: [topLevelQuery, nestedAuthorQuery],
      tie_breaker: 0.1,
    },
  };
}

Indexing books

Single document indexing

packages/api/src/infrastructure/search/elasticsearch/search.client.ts
/**
 * Indexes (or overwrites) a single book document in the books index,
 * keyed by the document's `id` field.
 */
export async function indexBook(book: Record<string, unknown>): Promise<void> {
  const documentId = String(book.id);
  await esClient.index({
    index: INDEX_NAME,
    id: documentId,
    document: book,
  });
}

Bulk indexing

/**
 * Bulk-indexes book documents in fixed-size chunks.
 *
 * Bug fix: the per-chunk `failed` array was `const`-declared inside the
 * `if (result.errors)` block but read again outside it when computing
 * `totalIndexed` — a block-scoping error that does not compile. The failure
 * count is now tracked in a variable scoped to the loop iteration.
 *
 * @param books     Denormalized book documents; each must carry an `id`.
 * @param chunkSize Documents per bulk request (default 500).
 * @returns Totals of successfully indexed documents and per-item failures.
 */
export async function indexBooksBulk(
  books: Record<string, unknown>[],
  chunkSize = 500,
): Promise<{ indexed: number; errors: number }> {
  let totalIndexed = 0;
  let totalErrors = 0;

  for (let i = 0; i < books.length; i += chunkSize) {
    const chunk = books.slice(i, i + chunkSize);
    // Bulk API body: alternating action line / document line.
    const operations = chunk.flatMap((doc) => [
      { index: { _index: INDEX_NAME, _id: String(doc.id) } },
      doc,
    ]);

    const result = await esClient.bulk({ refresh: false, operations });

    // `result.errors` only signals that at least one item failed;
    // count the actual per-item failures.
    let failedCount = 0;
    if (result.errors) {
      failedCount = result.items.filter((item) => item.index?.error).length;
      totalErrors += failedCount;
    }
    totalIndexed += chunk.length - failedCount;
  }

  return { indexed: totalIndexed, errors: totalErrors };
}

Automatic indexing

Books are indexed automatically by the file event worker after metadata extraction:
packages/api/src/infrastructure/workers/file.event.worker.ts
// Post-insert hook in the file-event worker: enrich metadata first so the
// ES document is complete, then index it.
if (bookInserted) {
  await bookMetadataService.enrichAndSaveMetadata({
    bookId: bookInserted.id,
    uuid: bookInserted.uuid,
  });

  // Index the new book in ES
  // fetchBookForIndex builds the ES document for this book; the falsy check
  // suggests it can return null/undefined for non-indexable books — TODO confirm.
  const esDoc = await fetchBookForIndex(bookInserted.id);
  if (esDoc) {
    // Indexing failure is logged but deliberately does not fail the worker job.
    await indexBook(esDoc).catch((err) =>
      console.error(`[Worker] ES index failed for book ${bookInserted.id}:`, err)
    );
  }
}

Schema versioning

The index schema includes a hash for automatic recreation on changes:
packages/api/src/infrastructure/search/elasticsearch/search.client.ts
// Load the index schema from disk and derive a short content hash; the hash
// is stamped into the index _meta so schema drift can be detected at startup.
const schemaPath = resolve(import.meta.dirname, "search.schema.json");
const schemaContent = readFileSync(schemaPath, "utf-8");
const schema = JSON.parse(schemaContent);
// First 16 hex chars of the SHA-256 of the raw file — any edit changes it.
const schemaHash = createHash("sha256")
  .update(schemaContent)
  .digest("hex")
  .slice(0, 16);

/**
 * Ensures the books index exists and matches the current schema.
 * Creates it (stamping the schema hash into mappings._meta) when absent;
 * when present with a different stored hash, triggers a full recreation.
 */
export async function ensureIndex(): Promise<void> {
  const indexPresent = await esClient.indices.exists({ index: INDEX_NAME });

  if (indexPresent) {
    // Compare the hash stamped at creation time with the current schema hash.
    const mappingByIndex = await esClient.indices.getMapping({ index: INDEX_NAME });
    const storedHash = mappingByIndex[INDEX_NAME]?.mappings?._meta?.schema_hash;
    if (storedHash !== schemaHash) {
      console.log(`[ES] Schema changed, recreating index`);
      await recreateIndex();
    }
    return;
  }

  await esClient.indices.create({
    index: INDEX_NAME,
    settings: schema.settings,
    mappings: {
      ...schema.mappings,
      _meta: { schema_hash: schemaHash },
    },
  });
}
Changing the schema will trigger a full index recreation. Make sure to re-index all books after schema changes.

Cursor-based pagination

Search uses search_after for efficient deep pagination:
packages/api/src/infrastructure/search/elasticsearch/search.query.ts
/** Serializes an ES sort-value tuple into an opaque, URL-safe cursor string. */
function encodeCursor(sortValues: unknown[]): string {
  const serialized = JSON.stringify(sortValues);
  return Buffer.from(serialized).toString("base64url");
}

/** Inverse of encodeCursor: restores the sort-value tuple from a cursor. */
function decodeCursor(cursor: string): unknown[] {
  const json = Buffer.from(cursor, "base64url").toString("utf-8");
  return JSON.parse(json);
}

/**
 * Assembles the final ES SearchRequest, wiring in search_after pagination
 * from the opaque request cursor.
 *
 * NOTE(review): `query`, `sort`, and `limit` are not defined in this
 * excerpt — presumably derived from `request` in code elided from the docs;
 * verify against the full source file.
 */
export function buildSearchRequest(
  indexName: string,
  request: SearchBooksRequest,
): SearchRequest {
  const body: SearchRequest = {
    index: indexName,
    query,
    sort,
    size: limit,
  };

  // Resume after the previous page's last sort values instead of from/size,
  // which degrades for deep pagination.
  if (request.cursor) {
    body.search_after = decodeCursor(request.cursor);
  }

  return body;
}

Example queries

// Example: a plain text search — script detection and field boosting are
// handled internally by the query builder.
const results = await searchBooks({
  query: "村上春樹",
  limit: 20,
});
// Automatically detects kanji and boosts appropriate fields

Build docs developers (and LLMs) love