Indexing Strategies

Efficient indexing is crucial for performance with large documents. Multiple indexing strategies can be used together.

Overview

Indexing strategies enable fast lookups and queries on large documents. Different index types serve different purposes:

  • Document Index: Fast node lookup by ID or path
  • Position Index: Fast position-to-path conversion
  • Content Index: Full-text search capabilities

Document Index

Node Indexing

Index nodes by ID for O(1) lookup:

class DocumentIndex {
  #nodeMap = new Map(); // nodeId -> Node
  #pathMap = new Map(); // nodeId -> path
  #parentMap = new Map(); // nodeId -> parentId
  
  index(document) {
    this.#indexNode(document, []);
  }
  
  #indexNode(node, path) {
    const nodeId = node.id || this.#generateId(node, path);
    node.id = nodeId;
    
    this.#nodeMap.set(nodeId, node);
    this.#pathMap.set(nodeId, path);
    
    if (path.length > 0) {
      const parentPath = path.slice(0, -1);
      const parentId = this.#getNodeIdAtPath(parentPath);
      this.#parentMap.set(nodeId, parentId);
    }
    
    // Index children
    node.children?.forEach((child, index) => {
      this.#indexNode(child, [...path, index]);
    });
  }
  
  getNodeById(id) {
    return this.#nodeMap.get(id);
  }
  
  getPathById(id) {
    return this.#pathMap.get(id);
  }
  
  getParentId(id) {
    return this.#parentMap.get(id);
  }
}

Path Indexing

Index paths for fast path-based lookups:

class PathIndex {
  #pathToNode = new Map(); // path -> Node
  #nodeToPath = new Map(); // nodeId -> path
  
  index(document) {
    this.#indexPath(document, []);
  }
  
  #indexPath(node, path) {
    const pathKey = this.#pathToString(path);
    this.#pathToNode.set(pathKey, node);
    
    if (node.id) {
      this.#nodeToPath.set(node.id, path);
    }
    
    node.children?.forEach((child, index) => {
      this.#indexPath(child, [...path, index]);
    });
  }
  
  getNodeByPath(path) {
    const pathKey = this.#pathToString(path);
    return this.#pathToNode.get(pathKey);
  }
  
  getPathByNodeId(nodeId) {
    return this.#nodeToPath.get(nodeId);
  }
  
  #pathToString(path) {
    return path.join(',');
  }
}

Position Index

Text Positioning

Index text positions for fast range queries:

class PositionIndex {
  #textPositions = [];
  #charIndexToPath = new Map();
  
  index(document) {
    let charIndex = 0;
    this.#indexNode(document, [], charIndex);
  }
  
  #indexNode(node, path, charIndex) {
    if (node.type === 'text') {
      // Index each character position
      for (let i = 0; i < node.text.length; i++) {
        this.#textPositions.push({
          path,
          offset: i,
          charIndex: charIndex + i
        });
        this.#charIndexToPath.set(charIndex + i, { path, offset: i });
      }
      return charIndex + node.text.length;
    }
    
    // Index children
    let currentIndex = charIndex;
    node.children?.forEach((child, index) => {
      currentIndex = this.#indexNode(child, [...path, index], currentIndex);
    });
    
    return currentIndex;
  }
  
  pathToCharIndex(path, offset) {
    const pos = this.#textPositions.find(
      p => this.#pathsEqual(p.path, path) && p.offset === offset
    );
    return pos ? pos.charIndex : -1;
  }
  
  charIndexToPath(charIndex) {
    return this.#charIndexToPath.get(charIndex);
  }
}

Range Queries

Support efficient range queries:

class RangeIndex extends PositionIndex {
  getRange(startPath, startOffset, endPath, endOffset) {
    const startIndex = this.pathToCharIndex(startPath, startOffset);
    const endIndex = this.pathToCharIndex(endPath, endOffset);
    
    if (startIndex === -1 || endIndex === -1) {
      return null;
    }
    
    // Extract text between positions
    return this.#extractText(startIndex, endIndex);
  }
  
  #extractText(startIndex, endIndex) {
    const positions = this.#textPositions.filter(
      p => p.charIndex >= startIndex && p.charIndex < endIndex
    );
    
    // Group by path and extract text
    const textByPath = new Map();
    positions.forEach(pos => {
      if (!textByPath.has(pos.path)) {
        textByPath.set(pos.path, []);
      }
      textByPath.get(pos.path).push(pos);
    });
    
    // Build text string
    let text = '';
    textByPath.forEach((positions, path) => {
      positions.sort((a, b) => a.offset - b.offset);
      const node = this.#getNodeByPath(path);
      positions.forEach(pos => {
        text += node.text[pos.offset];
      });
    });
    
    return text;
  }
}

Content Index

Inverted Index

Index content for search and queries:

class ContentIndex {
  #invertedIndex = new Map(); // term -> nodeIds
  #nodeContent = new Map(); // nodeId -> content
  
  index(document) {
    this.#indexNode(document);
  }
  
  #indexNode(node) {
    if (node.type === 'text') {
      const nodeId = node.id;
      const content = node.text;
      
      this.#nodeContent.set(nodeId, content);
      
      // Tokenize and index
      const terms = this.#tokenize(content);
      terms.forEach(term => {
        if (!this.#invertedIndex.has(term)) {
          this.#invertedIndex.set(term, new Set());
        }
        this.#invertedIndex.get(term).add(nodeId);
      });
    }
    
    // Index children
    node.children?.forEach(child => this.#indexNode(child));
  }
  
  #tokenize(text) {
    // Simple tokenization
    return text.toLowerCase()
      .split(/s+/)
      .filter(term => term.length > 0);
  }

Search Operations

Support various search operations:

class SearchIndex extends ContentIndex {
  search(query) {
    const terms = this.#tokenize(query);
    
    // Find nodes containing all terms (AND search)
    let result = null;
    
    terms.forEach(term => {
      const nodeIds = this.#invertedIndex.get(term);
      if (!nodeIds) {
        return [];
      }
      
      if (result === null) {
        result = new Set(nodeIds);
      } else {
        // Intersection
        result = new Set([...result].filter(id => nodeIds.has(id)));
      }
    });
    
    return result ? Array.from(result) : [];
  }
  
  searchOr(query) {
    const terms = this.#tokenize(query);
    const result = new Set();
    
    terms.forEach(term => {
      const nodeIds = this.#invertedIndex.get(term);
      if (nodeIds) {
        nodeIds.forEach(id => result.add(id));
      }
    });
    
    return Array.from(result);
  }
  
  searchPhrase(phrase) {
    // Find exact phrase matches
    const terms = this.#tokenize(phrase);
    const candidates = this.search(phrase);
    
    // Verify phrase order
    return candidates.filter(nodeId => {
      const content = this.#nodeContent.get(nodeId);
      return content.toLowerCase().includes(phrase.toLowerCase());
    });
  }
}

Index Maintenance

Efficiently update indexes when document changes:

class IndexManager {
  #indexes = [];
  
  addIndex(index) {
    this.#indexes.push(index);
  }
  
  index(document) {
    this.#indexes.forEach(index => {
      index.index(document);
    });
  }
  
  updateIndex(change) {
    this.#indexes.forEach(index => {
      index.updateIndex(change);
    });
  }
  
  // Incremental update
  updateIncremental(change) {
    switch (change.type) {
      case 'insert':
        this.#handleInsert(change);
        break;
      case 'delete':
        this.#handleDelete(change);
        break;
      case 'update':
        this.#handleUpdate(change);
        break;
      case 'move':
        this.#handleMove(change);
        break;
    }
  }
  
  #handleInsert(change) {
    // Insert new node into indexes
    this.#indexes.forEach(index => {
      index.insertNode(change.node, change.path);
    });
  }
  
  #handleDelete(change) {
    // Remove node from indexes
    this.#indexes.forEach(index => {
      index.deleteNode(change.path);
    });
  }
  
  #handleUpdate(change) {
    // Update node in indexes
    this.#indexes.forEach(index => {
      index.updateNode(change.path, change.node);
    });
  }
  
  #handleMove(change) {
    // Move node in indexes
    this.#indexes.forEach(index => {
      index.moveNode(change.path, change.newPath);
    });
  }
}