HTML Mapping

Converting between your model and HTML: serialization (model → HTML), deserialization (HTML → model), normalization, and incremental updates.

Overview

HTML mapping is the bridge between your abstract document model and the DOM. You need bidirectional conversion: model to HTML for rendering, and HTML to model for parsing pasted content or loading saved documents.

The challenge is that HTML is messy and inconsistent, while your model is clean and validated. Normalization and careful parsing are essential.

Serialization (Model → HTML)

Convert your model to HTML for rendering:

Node Serialization

/**
 * Serialize a model node (and its subtree) to an HTML string.
 *
 * @param {{type: string, attrs?: Object, children?: Array, text?: string}} node
 *   Model node to serialize.
 * @returns {string} HTML markup for the node. Unknown node types are
 *   unwrapped: only their children are serialized.
 */
function serializeNode(node) {
  // Attribute values are emitted inside double quotes, so '"' must be escaped
  // even if escapeHtml only covers text content. The extra replace is a no-op
  // when escapeHtml has already produced &quot;.
  const escapeAttr = (value) => escapeHtml(value).replace(/"/g, '&quot;');

  switch (node.type) {
    case 'document':
      return serializeChildren(node.children);

    case 'paragraph':
      return '<p>' + serializeChildren(node.children) + '</p>';

    case 'heading': {
      // Only <h1>-<h6> exist. Clamp integers into that range and fall back to
      // 1 otherwise, so attrs.level can never inject markup into the tag name.
      const rawLevel = Number(node.attrs?.level);
      const level = Number.isInteger(rawLevel)
        ? Math.min(6, Math.max(1, rawLevel))
        : 1;
      return '<h' + level + '>' + serializeChildren(node.children) + '</h' + level + '>';
    }

    case 'text':
      return serializeText(node);

    case 'link': {
      const href = escapeAttr(node.attrs?.href || '');
      // The parser stores a title for links; emit it when non-empty so it
      // survives a parse/serialize round trip.
      const title = node.attrs?.title
        ? ' title="' + escapeAttr(node.attrs.title) + '"'
        : '';
      return '<a href="' + href + '"' + title + '>' + serializeChildren(node.children) + '</a>';
    }

    case 'image': {
      const src = escapeAttr(node.attrs?.src || '');
      const alt = escapeAttr(node.attrs?.alt || '');
      return '<img src="' + src + '" alt="' + alt + '">';
    }

    default:
      return serializeChildren(node.children);
  }
}

/**
 * Serialize a list of child nodes and concatenate the results.
 *
 * @param {Array|null|undefined} children - Child model nodes; leaf nodes may
 *   not carry a `children` array at all, which is treated as "no children".
 * @returns {string} Concatenated HTML of all children ('' when there are none).
 */
function serializeChildren(children) {
  return (children ?? [])
    .map(child => serializeNode(child))
    .join('');
}

Mark Serialization

Apply marks to text nodes:

/**
 * Render a text node: escape its text, then wrap it in one element per mark.
 *
 * @param {{text: string, marks?: Array}} node - Text node from the model.
 * @returns {string} Escaped text, wrapped by its marks when it has any.
 */
function serializeText(node) {
  const escapedText = escapeHtml(node.text);

  const hasMarks = Boolean(node.marks) && node.marks.length > 0;
  if (!hasMarks) {
    return escapedText;
  }

  // Each mark wraps the markup built so far, so the first mark returned by
  // sortMarks ends up as the innermost element.
  return sortMarks(node.marks).reduce(
    (markup, mark) => wrapWithMark(markup, mark),
    escapedText
  );
}

/**
 * Wrap already-serialized HTML in the element that represents a mark.
 *
 * @param {string} html - Markup to wrap.
 * @param {{type: string, attrs?: Object}} mark - Mark to apply.
 * @returns {string} The wrapped markup, or `html` unchanged when the mark
 *   type has no element mapping.
 */
function wrapWithMark(html, mark) {
  // A Map (rather than a plain object) so that mark types such as
  // 'constructor' or 'toString' cannot resolve to inherited properties.
  const tagByMarkType = new Map([
    ['bold', 'strong'],
    ['italic', 'em'],
    ['underline', 'u'],
    ['strikethrough', 's'],
    ['code', 'code'],
  ]);

  const tag = tagByMarkType.get(mark.type);
  if (!tag) return html;

  const attrs = mark.attrs ? serializeAttrs(mark.attrs) : '';
  return '<' + tag + attrs + '>' + html + '</' + tag + '>';
}

// Example: text with multiple marks
// Input: { type: 'text', text: 'Bold italic', marks: [{ type: 'bold' }, { type: 'italic' }] }
// Output: '<em><strong>Bold italic</strong></em>' when sortMarks keeps the input order
// (each mark wraps the previous result, so the first mark ends up innermost;
// sort the marks differently to get '<strong><em>Bold italic</em></strong>')

Attribute Serialization

/**
 * Serialize an attribute map to an HTML attribute string.
 *
 * @param {Object} attrs - Attribute name/value pairs. Entries whose value is
 *   null or undefined are skipped, as are entries whose name is not a plain
 *   attribute identifier.
 * @returns {string} The attributes with a leading space (for example
 *   ' href="x"'), or '' when nothing is emitted.
 */
function serializeAttrs(attrs) {
  // Names are written verbatim into the tag, so anything containing spaces,
  // quotes, '=' or '>' would let a key inject extra attributes or markup.
  const validName = /^[A-Za-z_:][\w:.-]*$/;
  const parts = [];

  for (const [key, value] of Object.entries(attrs)) {
    if (value === null || value === undefined) continue;
    if (!validName.test(key)) continue;

    // Values sit inside double quotes; escape '"' here as well in case
    // escapeHtml only covers text content (no-op if it already did).
    const escaped = escapeHtml(String(value)).replace(/"/g, '&quot;');
    parts.push(key + '="' + escaped + '"');
  }

  return parts.length > 0 ? ' ' + parts.join(' ') : '';
}

// Example: null/undefined values are skipped and the result starts with a
// space so it can be appended directly after a tag name.
serializeAttrs({ href: 'https://example.com', title: 'Example' })
// Returns: ' href="https://example.com" title="Example"'

/**
 * Escape a value for safe inclusion in HTML text or a quoted attribute value.
 *
 * Works on strings only (no DOM needed) and, unlike reading back
 * `innerHTML` of a text node, also escapes double and single quotes, which is
 * required wherever the result is placed inside an attribute.
 *
 * @param {*} text - Value to escape; null/undefined become ''.
 * @returns {string} The escaped string.
 */
function escapeHtml(text) {
  const entities = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  return String(text ?? '').replace(/[&<>"']/g, (char) => entities[char]);
}

Deserialization (HTML → Model)

Parse HTML into your model:

HTML Parsing

/**
 * Parse an HTML string into a document model node.
 *
 * @param {string} html - HTML source (for example pasted or saved content).
 * @returns {{type: 'document', children: Array}} Document node whose children
 *   are the parsed top-level nodes.
 */
function parseHTML(html) {
  // Parse HTML string to DOM
  const parser = new DOMParser();
  const doc = parser.parseFromString(html, 'text/html');

  // parseNode yields a single node, null (ignored node), or an ARRAY of nodes
  // when it unwraps an element it does not map to a node type. flatMap drops
  // the nulls and splices those arrays in, so `children` stays a flat list.
  return {
    type: 'document',
    children: Array.from(doc.body.childNodes)
      .flatMap(node => parseNode(node) ?? [])
  };
}

/**
 * Convert one DOM node into its model representation.
 *
 * @param {Node} domNode - DOM node to convert.
 * @returns {Object|Array|null} A text node object for DOM text nodes; a
 *   `{type, attrs, children}` object for elements with a known node type; the
 *   array of parsed children for elements without one (the element is
 *   unwrapped); null for every other kind of DOM node.
 */
function parseNode(domNode) {
  const kind = domNode.nodeType;

  if (kind === Node.TEXT_NODE) {
    const marks = extractMarks(domNode);
    return { type: 'text', text: domNode.textContent, marks };
  }

  // Comments, processing instructions, etc. are not part of the model
  if (kind !== Node.ELEMENT_NODE) {
    return null;
  }

  const modelType = getNodeType(domNode.tagName);
  if (!modelType) {
    // No node type for this tag: keep the content, drop the wrapper
    return parseChildren(domNode.childNodes);
  }

  const attrs = extractAttributes(domNode, modelType);
  const children = parseChildren(domNode.childNodes);
  return { type: modelType, attrs, children };
}

/**
 * Parse a collection of DOM nodes into a flat array of model nodes.
 *
 * @param {Iterable|ArrayLike} domNodes - DOM nodes (for example `childNodes`).
 * @returns {Array} Parsed nodes in document order.
 */
function parseChildren(domNodes) {
  // parseNode returns null for ignored nodes and an array when it unwraps an
  // element; flatMap removes the former and splices in the latter so the
  // result never contains nested arrays.
  return Array.from(domNodes)
    .flatMap(node => parseNode(node) ?? []);
}

Node Parsing

/**
 * Map an HTML tag name to a model node type.
 *
 * @param {string} tagName - Tag name in any letter case.
 * @returns {string|null} 'paragraph', 'heading', 'link' or 'image'; null for
 *   every other tag, including formatting tags (STRONG, B, EM, I, U, CODE)
 *   that are represented as marks rather than nodes.
 */
function getNodeType(tagName) {
  const upperTag = tagName.toUpperCase();

  // H1 through H6 all map to the same node type
  if (/^H[1-6]$/.test(upperTag)) {
    return 'heading';
  }

  switch (upperTag) {
    case 'P':
      return 'paragraph';
    case 'A':
      return 'link';
    case 'IMG':
      return 'image';
    default:
      return null;
  }
}

/**
 * Collect the model attributes for an element of a given node type.
 *
 * @param {Element} domElement - Source DOM element.
 * @param {string} nodeType - Model node type chosen for the element.
 * @returns {Object} `{level}` for headings, `{href, title}` for links,
 *   `{src, alt}` for images, and `{}` for every other type. Missing DOM
 *   attributes are stored as ''.
 */
function extractAttributes(domElement, nodeType) {
  const readAttr = (name) => domElement.getAttribute(name) || '';

  switch (nodeType) {
    case 'heading': {
      // The level is the digit after the 'H' in the tag name; default to 1
      const digit = Number.parseInt(domElement.tagName[1], 10);
      return { level: digit || 1 };
    }
    case 'link':
      return { href: readAttr('href'), title: readAttr('title') };
    case 'image':
      return { src: readAttr('src'), alt: readAttr('alt') };
    default:
      return {};
  }
}

Mark Extraction

/**
 * Collect the marks that apply to a text node from its ancestor elements.
 *
 * @param {Node} textNode - DOM text node.
 * @param {Element|null} [root] - Ancestor at which to stop (exclusive).
 *   Defaults to a variable named `editor` when one is in scope, otherwise to
 *   null so the walk continues to the top of the tree (the case for
 *   documents produced by DOMParser, which have no editor element).
 * @returns {Array} Marks ordered from the innermost ancestor outwards.
 */
function extractMarks(textNode, root = typeof editor !== 'undefined' ? editor : null) {
  const marks = [];

  // Walk up the DOM tree to find formatting elements
  for (let current = textNode.parentElement;
    current && current !== root;
    current = current.parentElement) {
    const mark = getMarkFromElement(current);
    if (mark) {
      marks.push(mark);
    }
  }

  return marks;
}

/**
 * Return the mark represented by a formatting element, if any.
 *
 * @param {Element} element - DOM element to inspect.
 * @returns {Object|null} A fresh mark object (`{type}` plus `attrs` for
 *   links), or null when the element is not a formatting element.
 */
function getMarkFromElement(element) {
  // A switch instead of a per-call lookup table: href/title are only read
  // for <a>, and nothing is allocated for non-formatting elements.
  switch (element.tagName.toUpperCase()) {
    case 'STRONG':
    case 'B':
      return { type: 'bold' };
    case 'EM':
    case 'I':
      return { type: 'italic' };
    case 'U':
      return { type: 'underline' };
    case 'S':
      return { type: 'strikethrough' };
    case 'CODE':
      return { type: 'code' };
    case 'A':
      return {
        type: 'link',
        attrs: {
          href: element.getAttribute('href') || '',
          title: element.getAttribute('title') || ''
        }
      };
    default:
      return null;
  }
}

// Example: <strong><em>Text</em></strong>
// Text node has marks: [{ type: 'italic' }, { type: 'bold' }] (innermost element first)

HTML Normalization

Normalize inconsistent HTML to match your schema:

Element Normalization

/**
 * Normalize an HTML string: parse it, canonicalize its elements, repair its
 * block structure, and serialize it again.
 *
 * @param {string} html - Raw HTML.
 * @returns {string} The normalized markup (inner HTML of the parsed body).
 */
function normalizeHTML(html) {
  const { body } = new DOMParser().parseFromString(html, 'text/html');

  // Element-level cleanup runs first, then structural repair
  normalizeElements(body);
  fixStructure(body);

  return body.innerHTML;
}

/**
 * Canonicalize elements below `element` in place: <b> becomes <strong>,
 * <i> becomes <em>, <div> without block content becomes <p>, and style/class
 * attributes are removed.
 *
 * @param {Element} element - Root of the subtree to normalize (mutated).
 */
function normalizeElements(element) {
  // Create replacements in the document that owns `element` (normally the
  // inert document produced by DOMParser) rather than the live page.
  const doc = element.ownerDocument || document;

  // Replace `oldEl` with a new <tagName> element. Children are MOVED, not
  // copied through innerHTML: the markup is never re-parsed, and nested
  // elements keep their identity so later passes still reach them.
  const renameElement = (oldEl, tagName) => {
    const replacement = doc.createElement(tagName);
    while (oldEl.firstChild) {
      replacement.appendChild(oldEl.firstChild);
    }
    oldEl.parentNode.replaceChild(replacement, oldEl);
  };

  // Convert b to strong
  element.querySelectorAll('b').forEach(b => renameElement(b, 'strong'));

  // Convert i to em
  element.querySelectorAll('i').forEach(i => renameElement(i, 'em'));

  // Convert div to p (if appropriate). Process innermost divs first so an
  // outer div that wraps a converted div sees the new <p> and stays a div
  // instead of producing <p> nested inside <p>.
  Array.from(element.querySelectorAll('div')).reverse().forEach(div => {
    if (!div.querySelector('p, ul, ol, h1, h2, h3, h4, h5, h6, table')) {
      renameElement(div, 'p');
    }
  });

  // Remove style and class attributes
  element.querySelectorAll('[style]').forEach(el => {
    el.removeAttribute('style');
  });
  element.querySelectorAll('[class]').forEach(el => {
    el.removeAttribute('class');
  });
}

Structure Normalization

/**
 * Repair block structure below `element` in place: hoist block elements out
 * of block parents, then merge text nodes and remove empty nodes.
 *
 * @param {Element} element - Root of the subtree to fix (mutated).
 */
function fixStructure(element) {
  const blockTags = ['P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'UL', 'OL', 'BLOCKQUOTE'];

  for (const block of element.querySelectorAll(blockTags.join(','))) {
    const parent = block.parentElement;
    const isNestedInBlock = Boolean(parent) && blockTags.includes(parent.tagName);
    if (!isNestedInBlock) {
      continue;
    }

    const grandparent = parent.parentElement;
    if (!grandparent) {
      continue;
    }

    // Move the nested block to sit just before its former parent, and drop
    // the parent if that left it without any child nodes
    grandparent.insertBefore(block, parent);
    if (!parent.hasChildNodes()) {
      parent.remove();
    }
  }

  // Merge adjacent text nodes
  mergeTextNodes(element);

  // Remove empty nodes (except br)
  removeEmptyNodes(element);
}

/**
 * Merge adjacent text nodes below `element` in place.
 *
 * Delegates to the DOM's built-in Node.normalize(), which joins runs of
 * adjacent Text nodes (and drops empty ones) throughout the subtree. A manual
 * TreeWalker loop is easy to get wrong here: comparing only parents merges
 * text across intervening elements such as <br>, and removing the walker's
 * current node cuts the traversal short.
 *
 * @param {Element} element - Root of the subtree to normalize (mutated).
 */
function mergeTextNodes(element) {
  element.normalize();
}

/**
 * Remove empty descendant elements of `element` in place.
 *
 * An element is removed when it has no child nodes, or when its text is only
 * whitespace and it contains no <br>, <img> or <hr>. Elements that are
 * meaningful without content (<br>, <img>, <hr>) are always kept.
 *
 * @param {Element} element - Root of the subtree to clean (mutated; the root
 *   itself is never removed).
 */
function removeEmptyNodes(element) {
  // Void elements never have children, so the "no child nodes" rule must
  // not apply to them or every image would be stripped.
  const keepWhenEmpty = new Set(['BR', 'IMG', 'HR']);
  const toRemove = [];

  for (const node of element.querySelectorAll('*')) {
    if (keepWhenEmpty.has(node.tagName)) continue;

    if (!node.hasChildNodes() ||
        (node.textContent.trim() === '' && !node.querySelector('br, img, hr'))) {
      toRemove.push(node);
    }
  }

  // Collect first, remove afterwards, so removal cannot disturb iteration
  toRemove.forEach(node => node.remove());
}

Incremental Updates

Instead of re-rendering the entire document, update only changed parts:

Diff Algorithm

/**
 * Bring the DOM in line with a new model by applying only the differences
 * between the old and new model.
 *
 * @param {Object} oldModel - Model the DOM currently reflects.
 * @param {Object} newModel - Model the DOM should reflect.
 * @param {Node} domRoot - DOM node that renders the model root.
 */
function updateDOM(oldModel, newModel, domRoot) {
  const changes = diffModels(oldModel, newModel);

  // Changes are applied one by one, in the order the diff produced them
  for (const change of changes) {
    applyChange(change, domRoot);
  }
}

/**
 * Compute a positional diff between the children of two model nodes.
 *
 * @param {{children?: Array}} oldModel - Previous model node.
 * @param {{children?: Array}} newModel - Next model node.
 * @returns {Array<Object>} Change records meant to be applied in order:
 *   `{type: 'update', index, oldNode, newNode}` for positions whose nodes
 *   differ, `{type: 'insert', index, node}` for trailing additions
 *   (ascending index), and `{type: 'remove', index}` for trailing removals
 *   (DESCENDING index).
 */
function diffModels(oldModel, newModel) {
  const changes = [];

  const oldChildren = oldModel.children || [];
  const newChildren = newModel.children || [];
  const sharedLen = Math.min(oldChildren.length, newChildren.length);

  // Positions present in both lists: report the ones that changed
  for (let i = 0; i < sharedLen; i++) {
    if (!nodesEqual(oldChildren[i], newChildren[i])) {
      changes.push({
        type: 'update',
        index: i,
        oldNode: oldChildren[i],
        newNode: newChildren[i]
      });
    }
  }

  // Extra new children are appended in order
  for (let i = sharedLen; i < newChildren.length; i++) {
    changes.push({
      type: 'insert',
      index: i,
      node: newChildren[i]
    });
  }

  // Extra old children are removed from the end backwards: removing index i
  // shifts every later sibling down by one, so ascending order would skip
  // nodes when the changes are applied sequentially.
  for (let i = oldChildren.length - 1; i >= sharedLen; i--) {
    changes.push({
      type: 'remove',
      index: i
    });
  }

  return changes;
}

/**
 * Deep-compare two model nodes.
 *
 * Text nodes are equal when their text and marks match. Other nodes are
 * equal when their type and attrs match and their children are pairwise
 * equal, so a change anywhere inside a node is detected.
 *
 * @param {Object} node1 - First model node.
 * @param {Object} node2 - Second model node.
 * @returns {boolean} True when the nodes are structurally equal.
 */
function nodesEqual(node1, node2) {
  // Shallow comparison of attribute maps; a missing map counts as empty
  const attrsEqual = (a, b) => {
    const left = a ?? {};
    const right = b ?? {};
    const keys = Object.keys(left);
    return keys.length === Object.keys(right).length &&
      keys.every(key => left[key] === right[key]);
  };

  if (node1.type !== node2.type) return false;
  if (node1.type === 'text') {
    return node1.text === node2.text &&
           marksEqual(node1.marks, node2.marks);
  }

  if (!attrsEqual(node1.attrs, node2.attrs)) return false;

  const children1 = node1.children || [];
  const children2 = node2.children || [];
  return children1.length === children2.length &&
    children1.every((child, i) => nodesEqual(child, children2[i]));
}

DOM Patching

/**
 * Apply a single change record to the DOM.
 *
 * @param {{type: string, index: number, node?: Object, oldNode?: Object, newNode?: Object}} change
 *   Change to apply: 'insert', 'remove' or 'update'. Other types are ignored.
 * @param {Node} domRoot - DOM node that renders the model root.
 */
function applyChange(change, domRoot) {
  const target = findDOMNode(change.index, domRoot);

  if (change.type === 'insert') {
    const rendered = renderNode(change.node);
    if (target) {
      // A node already sits at this index: put the new one in front of it
      target.parentNode.insertBefore(rendered, target);
    } else {
      domRoot.appendChild(rendered);
    }
    return;
  }

  // Removals and updates only make sense when the DOM node exists
  if (!target) {
    return;
  }

  if (change.type === 'remove') {
    target.remove();
  } else if (change.type === 'update') {
    updateDOMNode(target, change.oldNode, change.newNode);
  }
}

/**
 * Update an existing DOM node so that it reflects `newNode`.
 *
 * @param {Node} domNode - DOM node currently rendering `oldNode`.
 * @param {Object} oldNode - Model node the DOM currently reflects.
 * @param {Object} newNode - Model node the DOM should reflect.
 */
function updateDOMNode(domNode, oldNode, newNode) {
  if (newNode.type !== 'text') {
    // Update element
    updateElement(domNode, oldNode, newNode);
    return;
  }

  // Track the node that actually ends up in the document: when an element
  // is swapped for a fresh text node, `domNode` is detached and marks must
  // be applied to its replacement instead.
  let liveNode = domNode;

  if (domNode.nodeType === Node.TEXT_NODE) {
    domNode.textContent = newNode.text;
  } else {
    // Replace element with text node
    liveNode = document.createTextNode(newNode.text);
    domNode.parentNode.replaceChild(liveNode, domNode);
  }

  // Update marks
  updateMarks(liveNode, oldNode.marks, newNode.marks);
}

Edge Cases

Nested Marks

HTML can have nested formatting: <strong><em>text</em></strong>

// HTML: <strong><em>Bold italic</em></strong>
// Model: Single text node with both marks
{
  type: 'text',
  text: 'Bold italic',
  marks: [
    { type: 'bold' },
    { type: 'italic' }
  ]
}

// When parsing, collect all marks from parent chain
/**
 * Collect all marks from the parent chain of a text node.
 *
 * @param {Node} textNode - DOM text node.
 * @param {Element|null} [root] - Ancestor at which to stop (exclusive).
 *   Defaults to a variable named `editor` when one is in scope, otherwise to
 *   null so the whole ancestor chain is inspected.
 * @returns {Array} Marks ordered from the innermost ancestor outwards.
 */
function extractMarks(textNode, root = typeof editor !== 'undefined' ? editor : null) {
  const marks = [];

  for (let current = textNode.parentElement;
    current && current !== root;
    current = current.parentElement) {
    const mark = getMarkFromElement(current);
    if (mark) {
      marks.push(mark);
    }
  }

  return marks;
}

// When serializing, apply marks in order
/**
 * Serialize a text node, wrapping the escaped text in each of its marks in
 * list order (the first mark becomes the innermost element).
 *
 * @param {{text: string, marks?: Array}} node - Text node from the model.
 * @returns {string} HTML for the text node.
 */
function serializeText(node) {
  let html = escapeHtml(node.text);
  // Unmarked text nodes may omit `marks` entirely
  (node.marks ?? []).forEach(mark => {
    html = wrapWithMark(html, mark);
  });
  return html;
}

Empty Nodes

Handle empty paragraphs, empty lists, etc.:

// Empty paragraph
{
  type: 'paragraph',
  children: []
}

// Serialize to: <p><br></p> or <p>&nbsp;</p>
/**
 * Serialization fragment for the empty-paragraph case: a paragraph without
 * children is emitted with a <br> placeholder.
 *
 * @param {{type: string, children: Array}} node - Model node.
 * @returns {string|undefined} '<p><br></p>' for an empty paragraph; other
 *   node types are elided in this fragment.
 */
function serializeNode(node) {
  const isEmptyParagraph =
    node.type === 'paragraph' && node.children.length === 0;

  if (isEmptyParagraph) {
    return '<p><br></p>';
  }
  // ...
}

// Parse empty paragraph
/**
 * Parsing fragment for the empty-paragraph case.
 *
 * @param {Node} domNode - DOM node to parse.
 * @returns {Object|undefined} An empty paragraph node for a <p> that has no
 *   visible text and no image; other nodes are elided in this fragment.
 */
function parseNode(domNode) {
  // textContent ignores images, so a paragraph holding only an <img> must
  // not be treated as empty (a lone <br> placeholder still counts as empty).
  const isEmptyParagraph =
    domNode.tagName === 'P' &&
    domNode.textContent.trim() === '' &&
    !domNode.querySelector('img');

  if (isEmptyParagraph) {
    return {
      type: 'paragraph',
      children: []
    };
  }
  // ...
}

Whitespace Handling

HTML collapses whitespace, but you might want to preserve it:

// Preserve whitespace in code blocks
{
  type: 'codeBlock',
  children: [
    { type: 'text', text: '  const x = 1;\n  const y = 2;' }
  ]
}

// Serialize with <pre><code>
/**
 * Serialization fragment for code blocks: the text of all children is
 * emitted inside <pre><code>, which preserves whitespace when rendered.
 *
 * @param {{type: string, children?: Array<{text?: string}>}} node - Model node.
 * @returns {string|undefined} Markup for a codeBlock node; other node types
 *   are not handled by this fragment.
 */
function serializeNode(node) {
  if (node.type === 'codeBlock') {
    // Join every child rather than reading children[0] only, so an empty
    // block does not throw and multi-node blocks keep all of their text.
    const code = (node.children ?? [])
      .map(child => child.text ?? '')
      .join('');
    return '<pre><code>' + escapeHtml(code) + '</code></pre>';
  }
}

// Parse preserving whitespace
/**
 * Parsing fragment for whitespace-preserving content.
 *
 * <pre> becomes a codeBlock node with one text child, mirroring how code
 * blocks are serialized as <pre><code>, so the two directions round-trip.
 * A standalone <code> becomes a text node carrying the code mark.
 *
 * @param {Node} domNode - DOM node to parse.
 * @returns {Object|undefined} The parsed node; other elements are not
 *   handled by this fragment.
 */
function parseNode(domNode) {
  if (domNode.tagName === 'PRE') {
    return {
      type: 'codeBlock',
      children: [
        { type: 'text', text: domNode.textContent }  // Preserves whitespace
      ]
    };
  }

  if (domNode.tagName === 'CODE') {
    return {
      type: 'text',
      text: domNode.textContent,  // Preserves whitespace
      marks: [{ type: 'code' }]
    };
  }
}