Overview
HTML mapping is the bridge between your abstract document model and the DOM. You need bidirectional conversion: model to HTML for rendering, and HTML to model for parsing pasted content or loading saved documents.
The challenge is that HTML is messy and inconsistent, while your model is clean and validated. Normalization and careful parsing are essential.
Serialization (Model → HTML)
Convert your model to HTML for rendering:
Node Serialization
/**
 * Serializes a model node (and, recursively, its children) to an HTML string.
 *
 * Node types other than document, paragraph, heading, text, link and image are
 * unwrapped: only their children are serialized.
 *
 * @param {{type: string, attrs?: Object, children?: Array}} node - Model node.
 * @returns {string} HTML markup for the node.
 */
function serializeNode(node) {
  // Attribute values are emitted inside double quotes, so a raw `"` must never
  // survive escaping. The extra replace is a no-op if escapeHtml already
  // converted quotes to entities.
  const attrValue = (value) => escapeHtml(String(value || '')).replace(/"/g, '&quot;');
  // Leaf nodes may carry no children array at all.
  const children = node.children ?? [];

  switch (node.type) {
    case 'document':
      return serializeChildren(children);
    case 'paragraph':
      return `<p>${serializeChildren(children)}</p>`;
    case 'heading': {
      // Only h1-h6 exist; parsing and clamping keeps model data from producing
      // an invalid tag name or injecting markup through `level`.
      const parsed = Number.parseInt(node.attrs?.level, 10);
      const level = Number.isNaN(parsed) ? 1 : Math.min(6, Math.max(1, parsed));
      return `<h${level}>${serializeChildren(children)}</h${level}>`;
    }
    case 'text':
      return serializeText(node);
    case 'link': {
      const href = attrValue(node.attrs?.href);
      // The parser stores a title on link nodes; emit it so it round-trips.
      const title = node.attrs?.title ? ` title="${attrValue(node.attrs.title)}"` : '';
      return `<a href="${href}"${title}>${serializeChildren(children)}</a>`;
    }
    case 'image': {
      const src = attrValue(node.attrs?.src);
      const alt = attrValue(node.attrs?.alt);
      return `<img src="${src}" alt="${alt}">`;
    }
    default:
      return serializeChildren(children);
  }
}
// Serializes each child with serializeNode and concatenates the resulting
// HTML strings in order, with no separator.
// NOTE(review): `children` must be an array; passing undefined (a node without
// a children property) throws on `.map` — confirm callers always supply one.
function serializeChildren(children) {
return children
.map(child => serializeNode(child))
.join('');
}Mark Serialization
Apply marks to text nodes:
/**
 * Renders a text node as HTML: the text is escaped, then wrapped once per mark.
 *
 * @param {{text: string, marks?: Array}} node - Text node from the model.
 * @returns {string} Escaped text, wrapped in one element per mark.
 */
function serializeText(node) {
  const escaped = escapeHtml(node.text);
  const marks = node.marks;
  const hasMarks = Boolean(marks) && marks.length > 0;
  if (!hasMarks) {
    return escaped;
  }
  // Each mark wraps the markup built so far, so the first mark returned by
  // sortMarks ends up innermost.
  return sortMarks(marks).reduce((wrapped, mark) => wrapWithMark(wrapped, mark), escaped);
}
/**
 * Wraps already-serialized HTML in the element that represents a mark.
 *
 * @param {string} html - Markup to wrap.
 * @param {{type: string, attrs?: Object}} mark - Mark to apply.
 * @returns {string} The wrapped markup, or `html` unchanged when the mark type
 *   has no tag mapping.
 */
function wrapWithMark(html, mark) {
  // A Map has no inherited keys, so mark types such as 'constructor' or
  // 'toString' cannot resolve to Object.prototype members and be emitted as a tag.
  const tagByMark = new Map([
    ['bold', 'strong'],
    ['italic', 'em'],
    ['underline', 'u'],
    ['strikethrough', 's'],
    ['code', 'code'],
  ]);
  const tag = tagByMark.get(mark.type);
  if (!tag) {
    return html;
  }
  const attrs = mark.attrs ? serializeAttrs(mark.attrs) : '';
  return `<${tag}${attrs}>${html}</${tag}>`;
}
// Example: text with multiple marks
// Input: { type: 'text', text: 'Bold italic', marks: [{ type: 'bold' }, { type: 'italic' }] }
// Output: '<strong><em>Bold italic</em></strong>'
// Or: '<em><strong>Bold italic</strong></em>' (order matters for some marks)
Attribute Serialization
/**
 * Serializes an attribute map to an HTML attribute string.
 *
 * Entries whose value is null or undefined are skipped, as are entries whose
 * key is not a plain attribute name (so a crafted key cannot inject markup).
 *
 * @param {Object} attrs - Attribute names mapped to values.
 * @returns {string} `' key="value" ...'` with a single leading space, or `''`
 *   when nothing is emitted.
 */
function serializeAttrs(attrs) {
  const validName = /^[A-Za-z_:][\w:.-]*$/;
  const parts = [];
  for (const [key, value] of Object.entries(attrs)) {
    if (value == null || !validName.test(key)) {
      continue;
    }
    // Values are wrapped in double quotes, so `"` must be an entity. The
    // replace is a no-op if escapeHtml already escaped it.
    const escaped = escapeHtml(String(value)).replace(/"/g, '&quot;');
    parts.push(`${key}="${escaped}"`);
  }
  return parts.length > 0 ? ` ${parts.join(' ')}` : '';
}
// Example: each non-null attribute is emitted as key="value", and the result
// starts with a single leading space
serializeAttrs({ href: 'https://example.com', title: 'Example' })
// Returns: ' href="https://example.com" title="Example"'
// Escapes text for use as HTML content by assigning it to the textContent of
// a freshly created <div> (never attached to the page) and reading back the
// div's innerHTML. Requires a global `document`.
// NOTE(review): HTML fragment serialization escapes &, < and > (and
// non-breaking spaces) in text content but leaves quote characters as-is, so
// the result alone does not look safe inside a quoted attribute value —
// confirm against the call sites that build href/src/alt attributes.
function escapeHtml(text) {
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}Deserialization (HTML → Model)
Parse HTML into your model:
HTML Parsing
/**
 * Parses an HTML string into a document model node.
 *
 * @param {string} html - Markup to parse (e.g. pasted or saved content).
 * @returns {{type: 'document', children: Array}} Document node whose children
 *   are the parsed top-level nodes, as one flat list.
 */
function parseHTML(html) {
  // Parse the HTML string to a DOM
  const doc = new DOMParser().parseFromString(html, 'text/html');

  // Convert the DOM to model nodes
  const children = Array.from(doc.body.childNodes)
    .map((node) => parseNode(node))
    // parseNode returns an array (possibly nested) when it unwraps an unknown
    // element; splice those results into the sibling list instead of keeping
    // arrays as children.
    .flat(Infinity)
    .filter(Boolean);

  return { type: 'document', children };
}
/**
 * Converts a DOM node into model form.
 *
 * @param {Node} domNode - DOM node to convert.
 * @returns {Object|Array|null} A text node (with the marks found by
 *   extractMarks), an element node `{ type, attrs, children }`, a flat array
 *   of the parsed children when the element has no node type of its own
 *   (it is unwrapped), or null for any other DOM node type.
 */
function parseNode(domNode) {
  if (domNode.nodeType === Node.TEXT_NODE) {
    return {
      type: 'text',
      text: domNode.textContent,
      marks: extractMarks(domNode),
    };
  }

  if (domNode.nodeType !== Node.ELEMENT_NODE) {
    // Ignore other node types
    return null;
  }

  // Unwrapped descendants come back as arrays; flatten so that children (and
  // the unwrap result below) are always a flat list of model nodes.
  const children = parseChildren(domNode.childNodes).flat(Infinity);

  const nodeType = getNodeType(domNode.tagName);
  if (!nodeType) {
    // Unknown element: unwrap it and hand back its parsed children
    return children;
  }

  return {
    type: nodeType,
    attrs: extractAttributes(domNode, nodeType),
    children,
  };
}
// Parses every node in `domNodes` with parseNode and drops falsy results
// (parseNode returns null for node types it ignores).
// NOTE(review): parseNode returns an array when it unwraps an unknown element,
// so the list returned here can contain arrays among the model nodes —
// confirm whether callers are expected to flatten it.
function parseChildren(domNodes) {
return Array.from(domNodes)
.map(node => parseNode(node))
.filter(Boolean);
}Node Parsing
/**
 * Maps an HTML tag name to the model node type it represents.
 *
 * @param {string} tagName - Tag name in any letter case.
 * @returns {string|null} 'paragraph', 'heading', 'link' or 'image'; null for
 *   every other tag.
 */
function getNodeType(tagName) {
  switch (tagName.toUpperCase()) {
    case 'P':
      return 'paragraph';
    case 'H1':
    case 'H2':
    case 'H3':
    case 'H4':
    case 'H5':
    case 'H6':
      return 'heading';
    case 'A':
      return 'link';
    case 'IMG':
      return 'image';
    default:
      // Includes STRONG, B, EM, I, U and CODE, which are handled as marks
      return null;
  }
}
// Builds the attrs object for a model node from its DOM element.
// Only heading, link and image node types contribute attributes; any other
// nodeType yields an empty object.
function extractAttributes(domElement, nodeType) {
const attrs = {};
if (nodeType === 'heading') {
// The level is the character after the first one in the tag name
// (e.g. '2' in 'H2'); anything that does not parse to a non-zero number
// falls back to 1.
const level = parseInt(domElement.tagName[1]) || 1;
attrs.level = level;
}
if (nodeType === 'link') {
// Missing attributes are stored as empty strings
attrs.href = domElement.getAttribute('href') || '';
attrs.title = domElement.getAttribute('title') || '';
}
if (nodeType === 'image') {
// Missing attributes are stored as empty strings
attrs.src = domElement.getAttribute('src') || '';
attrs.alt = domElement.getAttribute('alt') || '';
}
return attrs;
}Mark Extraction
/**
 * Collects the marks that apply to a text node by walking up its ancestors.
 *
 * The walk stops at the top of the tree or at `editor` (a name resolved from
 * the enclosing scope). Marks are returned innermost ancestor first.
 *
 * @param {Node} textNode - Text node whose formatting ancestors are inspected.
 * @returns {Array<Object>} Marks reported by getMarkFromElement.
 */
function extractMarks(textNode) {
  const found = [];
  for (let el = textNode.parentElement; el && el !== editor; el = el.parentElement) {
    const mark = getMarkFromElement(el);
    if (mark) {
      found.push(mark);
    }
  }
  return found;
}
/**
 * Returns the mark represented by a formatting element, if any.
 *
 * @param {Element} element - Element to inspect; its tag name may be in any case.
 * @returns {{type: string, attrs?: {href: string, title: string}}|null} A new
 *   mark object, or null when the tag does not represent a mark.
 */
function getMarkFromElement(element) {
  // Attributes are only read for anchors, so other elements cost no attribute
  // lookups and need not expose getAttribute at all.
  switch (element.tagName.toUpperCase()) {
    case 'STRONG':
    case 'B':
      return { type: 'bold' };
    case 'EM':
    case 'I':
      return { type: 'italic' };
    case 'U':
      return { type: 'underline' };
    case 'S':
      return { type: 'strikethrough' };
    case 'CODE':
      return { type: 'code' };
    case 'A':
      return {
        type: 'link',
        attrs: {
          href: element.getAttribute('href') || '',
          title: element.getAttribute('title') || '',
        },
      };
    default:
      return null;
  }
}
// Example: <strong><em>Text</em></strong>
// Text node has marks: [{ type: 'italic' }, { type: 'bold' }] (innermost element first)
HTML Normalization
Normalize inconsistent HTML to match your schema:
Element Normalization
/**
 * Normalizes an HTML string: parses it, rewrites elements with
 * normalizeElements, repairs structure with fixStructure, and serializes the
 * resulting body again.
 *
 * @param {string} html - Markup to normalize.
 * @returns {string} The body's innerHTML after both passes.
 */
function normalizeHTML(html) {
  const { body } = new DOMParser().parseFromString(html, 'text/html');

  // Element-level rewrites run before the structural pass
  normalizeElements(body);
  fixStructure(body);

  return body.innerHTML;
}
// Rewrites elements under `element` in place: b -> strong, i -> em, div -> p
// (when the div holds no block-level content), then strips style and class
// attributes. Replacement elements are created with the global `document` and
// receive only the original's innerHTML, so the original's own attributes are
// not carried over.
// NOTE(review): each querySelectorAll result is collected before the loop
// runs, and copying innerHTML produces fresh descendants. For nested matches
// (e.g. <b><b>x</b></b>) the inner original appears to be replaced only inside
// the already-detached outer element, leaving the copied inner <b> unconverted
// — confirm with a nested-input test.
function normalizeElements(element) {
// Convert b to strong
element.querySelectorAll('b').forEach(b => {
const strong = document.createElement('strong');
strong.innerHTML = b.innerHTML;
b.parentNode.replaceChild(strong, b);
});
// Convert i to em
element.querySelectorAll('i').forEach(i => {
const em = document.createElement('em');
em.innerHTML = i.innerHTML;
i.parentNode.replaceChild(em, i);
});
// Convert div to p, but only when the div contains no paragraph, list,
// heading or table descendant
element.querySelectorAll('div').forEach(div => {
if (!div.querySelector('p, ul, ol, h1, h2, h3, h4, h5, h6, table')) {
const p = document.createElement('p');
p.innerHTML = div.innerHTML;
div.parentNode.replaceChild(p, div);
}
});
// Remove style and class attributes
element.querySelectorAll('[style]').forEach(el => {
el.removeAttribute('style');
});
element.querySelectorAll('[class]').forEach(el => {
el.removeAttribute('class');
});
}Structure Normalization
/**
 * Repairs block structure under `element`: a block whose parent is itself a
 * block is moved out in front of that parent, then text nodes are merged and
 * empty nodes removed.
 *
 * @param {Element} element - Root of the subtree to repair (mutated in place).
 * @returns {void}
 */
function fixStructure(element) {
  const blockTags = ['P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'UL', 'OL', 'BLOCKQUOTE'];

  element.querySelectorAll(blockTags.join(',')).forEach((block) => {
    const container = block.parentElement;
    const isNested = Boolean(container) && blockTags.includes(container.tagName);
    if (!isNested) {
      return;
    }

    const target = container.parentElement;
    if (!target) {
      return;
    }

    // Lift the nested block out, placing it just before its former parent
    target.insertBefore(block, container);
    // Drop the former parent if lifting the block left it without child nodes
    if (!container.hasChildNodes()) {
      container.remove();
    }
  });

  mergeTextNodes(element);
  removeEmptyNodes(element);
}
/**
 * Merges adjacent text nodes throughout the subtree rooted at `element`.
 *
 * Delegates to the DOM's own Node.normalize(), which joins only text nodes
 * that are true neighbours (and drops empty ones). The earlier hand-rolled
 * walk removed the TreeWalker's current node, which ended the traversal after
 * the first merge, and it joined any two text nodes sharing a parent even
 * when an element such as <br> sat between them.
 *
 * @param {Node} element - Root of the subtree to normalize (mutated in place).
 * @returns {void}
 */
function mergeTextNodes(element) {
  element.normalize();
}
// Removes empty elements from the subtree under `element`.
// Candidates are collected during the TreeWalker traversal and removed only
// afterwards, so the tree is not modified while it is being walked.
function removeEmptyNodes(element) {
const walker = document.createTreeWalker(
element,
NodeFilter.SHOW_ELEMENT,
null
);
const toRemove = [];
let node;
while (node = walker.nextNode()) {
// <br> is never removed
if (node.tagName === 'BR') continue;
// Queue the element when it has no child nodes at all, or when its text is
// only whitespace and it contains no <br> or <img> descendant.
// NOTE(review): childless elements other than <br> (e.g. <img>, <hr>) also
// satisfy the first condition and so appear to be removed — confirm that is
// intended.
if (!node.hasChildNodes() ||
(node.textContent.trim() === '' && !node.querySelector('br, img'))) {
toRemove.push(node);
}
}
toRemove.forEach(node => node.remove());
}Incremental Updates
Instead of re-rendering the entire document, update only changed parts:
Diff Algorithm
/**
 * Brings the DOM under `domRoot` in line with `newModel` by diffing it
 * against `oldModel` and applying each resulting change in order.
 *
 * @param {Object} oldModel - Model the DOM currently reflects.
 * @param {Object} newModel - Model the DOM should reflect.
 * @param {Node} domRoot - Root DOM node handed to applyChange.
 * @returns {void}
 */
function updateDOM(oldModel, newModel, domRoot) {
  const patch = (change) => {
    applyChange(change, domRoot);
  };
  diffModels(oldModel, newModel).forEach(patch);
}
/**
 * Computes a positional diff between the children of two model nodes.
 *
 * @param {{children?: Array}} oldModel - Previous model.
 * @param {{children?: Array}} newModel - Next model.
 * @returns {Array<Object>} Changes of the form
 *   `{ type: 'update', index, oldNode, newNode }`,
 *   `{ type: 'insert', index, node }` or `{ type: 'remove', index }`.
 *   Updates and inserts come first in ascending index order, followed by
 *   removals in descending index order.
 */
function diffModels(oldModel, newModel) {
  const oldChildren = oldModel.children || [];
  const newChildren = newModel.children || [];
  const maxLen = Math.max(oldChildren.length, newChildren.length);

  const changes = [];
  const removals = [];

  // Simple diff: compare children position by position
  for (let i = 0; i < maxLen; i++) {
    const oldChild = oldChildren[i];
    const newChild = newChildren[i];

    if (oldChild && newChild) {
      if (!nodesEqual(oldChild, newChild)) {
        changes.push({ type: 'update', index: i, oldNode: oldChild, newNode: newChild });
      }
    } else if (newChild) {
      changes.push({ type: 'insert', index: i, node: newChild });
    } else if (oldChild) {
      removals.push({ type: 'remove', index: i });
    }
  }

  // Changes are applied one at a time against a live child list. Removing the
  // highest index first keeps the remaining removal indices valid; ascending
  // order would shift later children down and skip them.
  return [...changes, ...removals.reverse()];
}
// Decides whether two model nodes count as equal for diffing.
// Nodes of different types are never equal. Text nodes are equal when their
// text matches and marksEqual accepts their marks.
// NOTE(review): any other pair of nodes with the same type is reported as
// equal — attrs and children are not compared here — so such changes produce
// no 'update' entry. Confirm whether the elided comparison is implemented
// elsewhere.
function nodesEqual(node1, node2) {
if (node1.type !== node2.type) return false;
if (node1.type === 'text') {
return node1.text === node2.text &&
marksEqual(node1.marks, node2.marks);
}
// Compare other properties... (not implemented in this snippet)
return true;
}DOM Patching
/**
 * Applies a single diff entry to the DOM.
 *
 * - 'insert': renders the node and places it before the DOM node found at
 *   `change.index`, or appends it to `domRoot` when none is found.
 * - 'remove': removes the DOM node at `change.index`, if found.
 * - 'update': passes the DOM node, if found, to updateDOMNode.
 * Any other change type is ignored.
 *
 * @param {{type: string, index: number, node?: Object, oldNode?: Object, newNode?: Object}} change
 * @param {Node} domRoot - Root DOM node the index is resolved against.
 * @returns {void}
 */
function applyChange(change, domRoot) {
  const target = findDOMNode(change.index, domRoot);

  if (change.type === 'insert') {
    const rendered = renderNode(change.node);
    if (target) {
      target.parentNode.insertBefore(rendered, target);
    } else {
      domRoot.appendChild(rendered);
    }
    return;
  }

  // The remaining change types only act on an existing DOM node
  if (!target) {
    return;
  }

  if (change.type === 'remove') {
    target.remove();
  } else if (change.type === 'update') {
    // Update in place
    updateDOMNode(target, change.oldNode, change.newNode);
  }
}
// Updates an existing DOM node so that it reflects `newNode`.
// Text model nodes have their text content (and marks) updated; every other
// node type is delegated to updateElement.
function updateDOMNode(domNode, oldNode, newNode) {
if (newNode.type === 'text') {
// Update text content
if (domNode.nodeType === Node.TEXT_NODE) {
domNode.textContent = newNode.text;
} else {
// The DOM node is not a text node: replace it with a new text node
const textNode = document.createTextNode(newNode.text);
domNode.parentNode.replaceChild(textNode, domNode);
}
// Update marks
// NOTE(review): after the replaceChild branch above, `domNode` is the node
// that was just replaced, not the new text node, yet it is what updateMarks
// receives — confirm this is intended.
updateMarks(domNode, oldNode.marks, newNode.marks);
} else {
// Update element
updateElement(domNode, oldNode, newNode);
}
}Edge Cases
Nested Marks
HTML can have nested formatting: <strong><em>text</em></strong>
// HTML: <strong><em>Bold italic</em></strong>
// Model: Single text node with both marks
{
type: 'text',
text: 'Bold italic',
marks: [
{ type: 'bold' },
{ type: 'italic' }
]
}
// When parsing, collect all marks from parent chain
/**
 * Gathers the marks for a text node from its chain of ancestor elements.
 *
 * Ancestors are visited from the text node's parent upwards, stopping at the
 * top of the tree or at `editor` (resolved from the enclosing scope), so the
 * resulting marks are ordered innermost first.
 *
 * @param {Node} textNode - Text node to inspect.
 * @returns {Array<Object>} Marks reported by getMarkFromElement.
 */
function extractMarks(textNode) {
  const ancestors = [];
  let cursor = textNode.parentElement;
  while (cursor && cursor !== editor) {
    ancestors.push(cursor);
    cursor = cursor.parentElement;
  }
  // Keep only ancestors that actually represent a mark
  return ancestors
    .map((ancestor) => getMarkFromElement(ancestor))
    .filter((mark) => Boolean(mark));
}
// When serializing, apply marks in order
// Escapes the node's text, then wraps the result once per mark in array
// order, so the first mark ends up innermost.
// NOTE(review): node.marks.forEach is called unconditionally, so a text node
// without a marks array throws here — confirm every text node carries one.
function serializeText(node) {
let html = escapeHtml(node.text);
node.marks.forEach(mark => {
html = wrapWithMark(html, mark);
});
return html;
}Empty Nodes
Handle empty paragraphs, empty lists, etc.:
// Empty paragraph
{
type: 'paragraph',
children: []
}
// Serialize to: <p><br></p> or <p>&nbsp;</p>
// Snippet: special-cases the empty paragraph during serialization.
function serializeNode(node) {
// A paragraph with no children is emitted with a <br> placeholder
if (node.type === 'paragraph' && node.children.length === 0) {
return '<p><br></p>';
}
// ... (handling of all other nodes is elided in this snippet)
}
// Parse empty paragraph
// Snippet: special-cases the empty paragraph during parsing.
function parseNode(domNode) {
// A <p> whose text content is empty or whitespace-only becomes a paragraph
// with no children.
// NOTE(review): a <p> holding only non-text content (e.g. a lone <img>) also
// has empty textContent and would be treated as empty here — confirm.
if (domNode.tagName === 'P' && domNode.textContent.trim() === '') {
return {
type: 'paragraph',
children: []
};
}
// ... (handling of all other nodes is elided in this snippet)
}Whitespace Handling
HTML collapses whitespace, but you might want to preserve it:
// Preserve whitespace in code blocks
{
type: 'codeBlock',
children: [
{ type: 'text', text: ' const x = 1;\nconst y = 2;' }
]
}
// Serialize with <pre><code>
/**
 * Snippet: serializes a codeBlock node as <pre><code>…</code></pre>.
 *
 * The text of all children is concatenated before escaping, so a code block
 * split across several text nodes is emitted in full and an empty code block
 * yields an empty element instead of throwing.
 *
 * @param {{type: string, children?: Array<{text?: string}>}} node - Model node.
 * @returns {string|undefined} Markup for a codeBlock; undefined for any other
 *   node type (not handled by this snippet).
 */
function serializeNode(node) {
  if (node.type === 'codeBlock') {
    const code = (node.children ?? []).map((child) => child.text ?? '').join('');
    return `<pre><code>${escapeHtml(code)}</code></pre>`;
  }
}
// Parse preserving whitespace
/**
 * Snippet: parses <pre> and <code> elements into a text node whose text is
 * the element's textContent, whitespace included.
 *
 * A <code> element yields a text node carrying a code mark; a <pre> element
 * yields a text node with no marks.
 *
 * @param {Element} domNode - DOM node to parse.
 * @returns {{type: 'text', text: string, marks: Array}|undefined} The text
 *   node, or undefined for any other tag (not handled by this snippet).
 */
function parseNode(domNode) {
  const { tagName } = domNode;
  const isCode = tagName === 'CODE';
  if (!isCode && tagName !== 'PRE') {
    return undefined;
  }

  const marks = isCode ? [{ type: 'code' }] : [];
  // textContent keeps the original whitespace
  return { type: 'text', text: domNode.textContent, marks };
}