Fix a variety of issues with HTML → Markdown conversion (#8004)

* Fix a variety of issues with HTML → Markdown conversion

  Signed-off-by: Robin Townsend <robin@robin.town>

* Fix lint

  Signed-off-by: Robin Townsend <robin@robin.town>

* Fix @room pill formatting not being applied to link text

  Signed-off-by: Robin Townsend <robin@robin.town>
2024-11-15 20:54:59 +08:00 · 2022-03-09 07:43:05 -05:00 · 2022-03-09 07:43:05 -05:00 · c10ac9e4a0
commit c10ac9e4a0
parent 65691202f7
3 changed files with 422 additions and 260 deletions
--- a/src/editor/deserialize.ts
+++ b/src/editor/deserialize.ts
@ -17,190 +17,110 @@ limitations under the License.

 import { MatrixEvent } from "matrix-js-sdk/src/models/event";

-import { walkDOMDepthFirst } from "./dom";
 import { checkBlockNode } from "../HtmlUtils";
 import { getPrimaryPermalinkEntity } from "../utils/permalinks/Permalinks";
 import { Part, PartCreator, Type } from "./parts";
 import SdkConfig from "../SdkConfig";
 import { textToHtmlRainbow } from "../utils/colour";

-function parseAtRoomMentions(text: string, partCreator: PartCreator): Part[] {
+const LIST_TYPES = ["UL", "OL", "LI"];
+
+// Backslash-escapes Markdown markup in the given text: \ * _ [ ] ` < anywhere, and > only at the very start
+function escape(text: string): string {
+    return text.replace(/[\\*_[\]`<]|^>/g, match => `\\${match}`); // no "m" flag: ^ is start of text
+}
+
+// Finds the length of the longest run of consecutive backticks in the given text, used for
+// escaping backticks in code blocks (returns 0 when the text contains no backtick)
+function longestBacktickSequence(text: string): number {
+    let length = 0; // longest completed run seen so far
+    let currentLength = 0; // length of the run currently being scanned
+
+    for (const c of text) {
+        if (c === "`") {
+            currentLength++;
+        } else {
+            length = Math.max(length, currentLength); // run ended; record it
+            currentLength = 0;
+        }
+    }
+
+    return Math.max(length, currentLength); // covers a run that reaches the end of the text
+}
+
+function isListChild(n: Node): boolean { // true when n's direct parent is named UL, OL or LI
+    return LIST_TYPES.includes(n.parentNode?.nodeName);
+}
+
+function parseAtRoomMentions(text: string, pc: PartCreator): Part[] { // plain text parts and @room pills
    const ATROOM = "@room";
    const parts: Part[] = [];
    text.split(ATROOM).forEach((textPart, i, arr) => {
        if (textPart.length) {
-            parts.push(...partCreator.plainWithEmoji(textPart));
+            parts.push(...pc.plainWithEmoji(escape(textPart))); // markup in the literal text is escaped
        }
        // it's safe to never append @room after the last textPart
        // as split will report an empty string at the end if
        // `text` ended in @room.
        const isLast = i === arr.length - 1;
        if (!isLast) {
-            parts.push(partCreator.atRoomPill(ATROOM));
+            parts.push(pc.atRoomPill(ATROOM)); // one pill per "@room" separator that split() consumed
        }
    });
    return parts;
 }

-function parseLink(a: HTMLAnchorElement, partCreator: PartCreator): Part[] {
-    const { href } = a;
+function parseLink(n: Node, pc: PartCreator): Part[] { // <a> -> pill, bare text, or [text](href)
+    const { href } = n as HTMLAnchorElement;
    const resourceId = getPrimaryPermalinkEntity(href); // The room/user ID
-    const prefix = resourceId ? resourceId[0] : undefined; // First character of ID
-    switch (prefix) {
-        case "@":
-            return [partCreator.userPill(a.textContent, resourceId)];
-        case "#":
-            return [partCreator.roomPill(resourceId)];
-        default: {
-            if (href === a.textContent) {
-                return partCreator.plainWithEmoji(a.textContent);
-            } else {
-                return partCreator.plainWithEmoji(`[${a.textContent.replace(/[[\\\]]/g, c => "\\" + c)}](${href})`);
-            }
-        }
+
+    switch (resourceId?.[0]) { // first character of the ID selects the pill type
+        case "@": return [pc.userPill(n.textContent, resourceId)];
+        case "#": return [pc.roomPill(resourceId)];
+    }
+
+    const children = Array.from(n.childNodes);
+    if (href === n.textContent && children.every(c => c.nodeType === Node.TEXT_NODE)) { // bare URL as text
+        return parseAtRoomMentions(n.textContent, pc);
+    } else {
+        return [pc.plain("["), ...parseChildren(n, pc), pc.plain(`](${href})`)]; // link text keeps nested markup
    }
 }

-function parseImage(img: HTMLImageElement, partCreator: PartCreator): Part[] {
-    const { src } = img;
-    return partCreator.plainWithEmoji(`![${img.alt.replace(/[[\\\]]/g, c => "\\" + c)}](${src})`);
+function parseImage(n: Node, pc: PartCreator): Part[] { // <img> -> ![alt](src)
+    const { alt, src } = n as HTMLImageElement;
+    return pc.plainWithEmoji(`![${escape(alt)}](${src})`); // only alt is escaped; src is inserted as-is
 }

-function parseCodeBlock(n: HTMLElement, partCreator: PartCreator): Part[] {
-    const parts: Part[] = [];
+function parseCodeBlock(n: Node, pc: PartCreator): Part[] { // <pre> -> fenced code block
    let language = "";
-    if (n.firstChild && n.firstChild.nodeName === "CODE") {
-        for (const className of (<HTMLElement>n.firstChild).classList) {
+    if (n.firstChild?.nodeName === "CODE") { // a CODE first child may carry a language-* class
+        for (const className of (n.firstChild as HTMLElement).classList) {
            if (className.startsWith("language-") && !className.startsWith("language-_")) {
                language = className.substr("language-".length);
                break;
            }
        }
    }
-    const preLines = ("```" + language + "\n" + n.textContent + "```").split("\n");
-    preLines.forEach((l, i) => {
-        parts.push(...partCreator.plainWithEmoji(l));
-        if (i < preLines.length - 1) {
-            parts.push(partCreator.newline());
-        }
+
+    const text = n.textContent.replace(/\n$/, ""); // strip a single trailing newline from the code text
+    // Escape backticks by using even more backticks for the fence if necessary
+    const fence = "`".repeat(Math.max(3, longestBacktickSequence(text) + 1)); // at least three backticks
+    const parts: Part[] = [...pc.plainWithEmoji(fence + language), pc.newline()];
+
+    text.split("\n").forEach(line => {
+        parts.push(...pc.plainWithEmoji(line));
+        parts.push(pc.newline()); // every code line, including the last, is followed by a newline part
    });
+
+    parts.push(pc.plain(fence)); // closing fence has the same length as the opening one
    return parts;
 }

-function parseHeader(el: HTMLElement, partCreator: PartCreator): Part {
-    const depth = parseInt(el.nodeName.substr(1), 10);
-    return partCreator.plain("#".repeat(depth) + " ");
-}
-
-interface IState {
-    listIndex: number[];
-    listDepth?: number;
-}
-
-function parseElement(
-    n: HTMLElement,
-    partCreator: PartCreator,
-    lastNode: Node | undefined,
-    state: IState,
-): Part | Part[] {
-    switch (n.nodeName) {
-        case "H1":
-        case "H2":
-        case "H3":
-        case "H4":
-        case "H5":
-        case "H6":
-            return parseHeader(n, partCreator);
-        case "A":
-            return parseLink(<HTMLAnchorElement>n, partCreator);
-        case "IMG":
-            return parseImage(<HTMLImageElement>n, partCreator);
-        case "BR":
-            return partCreator.newline();
-        case "HR":
-            // the newline arrangement here is quite specific otherwise it may be misconstrued as marking the previous
-            // text line as a header instead of acting as a horizontal rule.
-            return [
-                partCreator.newline(),
-                partCreator.plain("---"),
-                partCreator.newline(),
-            ];
-        case "EM":
-            return partCreator.plainWithEmoji(`_${n.textContent}_`);
-        case "STRONG":
-            return partCreator.plainWithEmoji(`**${n.textContent}**`);
-        case "PRE":
-            return parseCodeBlock(n, partCreator);
-        case "CODE":
-            return partCreator.plainWithEmoji(`\`${n.textContent}\``);
-        case "DEL":
-            return partCreator.plainWithEmoji(`<del>${n.textContent}</del>`);
-        case "SUB":
-            return partCreator.plainWithEmoji(`<sub>${n.textContent}</sub>`);
-        case "SUP":
-            return partCreator.plainWithEmoji(`<sup>${n.textContent}</sup>`);
-        case "U":
-            return partCreator.plainWithEmoji(`<u>${n.textContent}</u>`);
-        case "LI": {
-            const BASE_INDENT = 4;
-            const depth = state.listDepth - 1;
-            const indent = " ".repeat(BASE_INDENT * depth);
-            if (n.parentElement.nodeName === "OL") {
-                // The markdown parser doesn't do nested indexed lists at all, but this supports it anyway.
-                const index = state.listIndex[state.listIndex.length - 1];
-                state.listIndex[state.listIndex.length - 1] += 1;
-                return partCreator.plain(`${indent}${index}. `);
-            } else {
-                return partCreator.plain(`${indent}- `);
-            }
-        }
-        case "P": {
-            if (lastNode) {
-                return partCreator.newline();
-            }
-            break;
-        }
-        case "DIV":
-        case "SPAN": {
-            // math nodes are translated back into delimited latex strings
-            if (n.hasAttribute("data-mx-maths")) {
-                const delimLeft = (n.nodeName == "SPAN") ?
-                    ((SdkConfig.get()['latex_maths_delims'] || {})['inline'] || {})['left'] || "\\(" :
-                    ((SdkConfig.get()['latex_maths_delims'] || {})['display'] || {})['left'] || "\\[";
-                const delimRight = (n.nodeName == "SPAN") ?
-                    ((SdkConfig.get()['latex_maths_delims'] || {})['inline'] || {})['right'] || "\\)" :
-                    ((SdkConfig.get()['latex_maths_delims'] || {})['display'] || {})['right'] || "\\]";
-                const tex = n.getAttribute("data-mx-maths");
-                return partCreator.plainWithEmoji(delimLeft + tex + delimRight);
-            } else if (!checkDescendInto(n)) {
-                return partCreator.plainWithEmoji(n.textContent);
-            }
-            break;
-        }
-        case "OL":
-            state.listIndex.push((<HTMLOListElement>n).start || 1);
-            /* falls through */
-        case "UL":
-            state.listDepth = (state.listDepth || 0) + 1;
-            /* falls through */
-        default:
-            // don't textify block nodes we'll descend into
-            if (!checkDescendInto(n)) {
-                return partCreator.plainWithEmoji(n.textContent);
-            }
-    }
-}
-
-function checkDescendInto(node) {
-    switch (node.nodeName) {
-        case "PRE":
-            // a code block is textified in parseCodeBlock
-            // as we don't want to preserve markup in it,
-            // so no need to descend into it
-            return false;
-        default:
-            return checkBlockNode(node);
-    }
+function parseHeader(n: Node, pc: PartCreator): Part[] { // H1-H6 -> "#" prefix + parsed heading content
+    const depth = parseInt(n.nodeName.substr(1), 10); // digit after the "H", e.g. "H3" -> 3
+    const prefix = pc.plain("#".repeat(depth) + " ");
+    return [prefix, ...parseChildren(n, pc)];
 }

 function checkIgnored(n) {
@ -214,144 +134,169 @@ function checkIgnored(n) {
    return true;
 }

-const QUOTE_LINE_PREFIX = "> ";
-function prefixQuoteLines(isFirstNode, parts, partCreator) {
-    // a newline (to append a > to) wouldn't be added to parts for the first line
-    // if there was no content before the BLOCKQUOTE, so handle that
-    if (isFirstNode) {
-        parts.splice(0, 0, partCreator.plain(QUOTE_LINE_PREFIX));
-    }
-    for (let i = 0; i < parts.length; i += 1) {
+function prefixLines(parts: Part[], prefix: string, pc: PartCreator) { // mutates parts in place
+    parts.unshift(pc.plain(prefix)); // the first line has no preceding newline part
+    for (let i = 0; i < parts.length; i++) {
        if (parts[i].type === Type.Newline) {
-            parts.splice(i + 1, 0, partCreator.plain(QUOTE_LINE_PREFIX));
+            parts.splice(i + 1, 0, pc.plain(prefix)); // prefix goes right after each newline part
            i += 1;
        }
    }
 }

-function parseHtmlMessage(html: string, partCreator: PartCreator, isQuotedMessage: boolean): Part[] {
+function parseChildren(n: Node, pc: PartCreator, mkListItem?: (li: Node) => Part[]): Part[] {
+    let prev; // last child that produced any parts
+    return Array.from(n.childNodes).flatMap(c => { // concatenates the parts of every child of n
+        const parsed = parseNode(c, pc, mkListItem); // mkListItem is forwarded to each child
+        if (parsed.length && prev && (checkBlockNode(prev) || checkBlockNode(c))) { // block boundary
+            if (isListChild(c)) {
+                // Use tighter spacing within lists
+                parsed.unshift(pc.newline());
+            } else {
+                parsed.unshift(pc.newline(), pc.newline()); // two newline parts between blocks
+            }
+        }
+        if (parsed.length) prev = c; // children that yield nothing are not remembered
+        return parsed;
+    });
+}
+
+function parseNode(n: Node, pc: PartCreator, mkListItem?: (li: Node) => Part[]): Part[] { // node -> parts
+    if (checkIgnored(n)) return []; // ignored nodes contribute nothing
+
+    switch (n.nodeType) {
+        case Node.TEXT_NODE:
+            return parseAtRoomMentions(n.nodeValue, pc); // text is escaped and @room is split into pills
+        case Node.ELEMENT_NODE:
+            switch (n.nodeName) {
+                case "H1":
+                case "H2":
+                case "H3":
+                case "H4":
+                case "H5":
+                case "H6":
+                    return parseHeader(n, pc);
+                case "A":
+                    return parseLink(n, pc);
+                case "IMG":
+                    return parseImage(n, pc);
+                case "BR":
+                    return [pc.newline()];
+                case "HR":
+                    return [pc.plain("---")];
+                case "EM":
+                    return [pc.plain("_"), ...parseChildren(n, pc), pc.plain("_")];
+                case "STRONG":
+                    return [pc.plain("**"), ...parseChildren(n, pc), pc.plain("**")];
+                case "DEL":
+                    return [pc.plain("<del>"), ...parseChildren(n, pc), pc.plain("</del>")];
+                case "SUB":
+                    return [pc.plain("<sub>"), ...parseChildren(n, pc), pc.plain("</sub>")];
+                case "SUP":
+                    return [pc.plain("<sup>"), ...parseChildren(n, pc), pc.plain("</sup>")];
+                case "U":
+                    return [pc.plain("<u>"), ...parseChildren(n, pc), pc.plain("</u>")];
+                case "PRE":
+                    return parseCodeBlock(n, pc);
+                case "CODE": {
+                    // Escape backticks by using multiple backticks for the fence if necessary
+                    const fence = "`".repeat(longestBacktickSequence(n.textContent) + 1);
+                    return pc.plainWithEmoji(`${fence}${n.textContent}${fence}`); // content is not escaped
+                }
+                case "BLOCKQUOTE": {
+                    const parts = parseChildren(n, pc);
+                    prefixLines(parts, "> ", pc); // nested quotes accumulate one "> " per level
+                    return parts;
+                }
+                case "LI":
+                    return mkListItem?.(n) ?? parseChildren(n, pc); // no list context: content only
+                case "UL": {
+                    const parts = parseChildren(n, pc, li => [pc.plain("- "), ...parseChildren(li, pc)]);
+                    if (isListChild(n)) {
+                        prefixLines(parts, "    ", pc); // nested list: indent every line by four spaces
+                    }
+                    return parts;
+                }
+                case "OL": {
+                    let counter = (n as HTMLOListElement).start ?? 1; // honour <ol start>, as the old code did
+                    const parts = parseChildren(n, pc, li => {
+                        const parts = [pc.plain(`${counter}. `), ...parseChildren(li, pc)];
+                        counter++; // numbering is local to this list
+                        return parts;
+                    });
+                    if (isListChild(n)) {
+                        prefixLines(parts, "    ", pc); // nested list: indent every line by four spaces
+                    }
+                    return parts;
+                }
+                case "DIV":
+                case "SPAN":
+                    // Math nodes are translated back into delimited latex strings
+                    if ((n as Element).hasAttribute("data-mx-maths")) {
+                        const delims = SdkConfig.get().latex_maths_delims;
+                        const delimLeft = (n.nodeName === "SPAN") ?
+                            delims?.inline?.left ?? "\\(" :
+                            delims?.display?.left ?? "\\[";
+                        const delimRight = (n.nodeName === "SPAN") ?
+                            delims?.inline?.right ?? "\\)" :
+                            delims?.display?.right ?? "\\]";
+                        const tex = (n as Element).getAttribute("data-mx-maths");
+
+                        return pc.plainWithEmoji(`${delimLeft}${tex}${delimRight}`);
+                    } // DIV/SPAN without the attribute leave the switch and are handled generically below
+            }
+    }
+
+    return parseChildren(n, pc); // any other node: just convert its children
+}
+
+function parseHtmlMessage(html: string, pc: PartCreator, isQuotedMessage: boolean): Part[] { // HTML -> parts
    // no nodes from parsing here should be inserted in the document,
    // as scripts in event handlers, etc would be executed then.
    // we're only taking text, so that is fine
-    const rootNode = new DOMParser().parseFromString(html, "text/html").body;
-    const parts: Part[] = [];
-    let lastNode: Node;
-    let inQuote = isQuotedMessage;
-    const state: IState = {
-        listIndex: [],
-    };
-
-    function onNodeEnter(n: Node) {
-        if (checkIgnored(n)) {
-            return false;
-        }
-        if (n.nodeName === "BLOCKQUOTE") {
-            inQuote = true;
-        }
-
-        const newParts: Part[] = [];
-        if (lastNode && (checkBlockNode(lastNode) || checkBlockNode(n))) {
-            newParts.push(partCreator.newline());
-        }
-
-        if (n.nodeType === Node.TEXT_NODE) {
-            let { nodeValue } = n;
-
-            // Sometimes commonmark adds a newline at the end of the list item text
-            if (n.parentNode.nodeName === "LI") {
-                nodeValue = nodeValue.trimEnd();
-            }
-            newParts.push(...parseAtRoomMentions(nodeValue, partCreator));
-
-            const grandParent = n.parentNode.parentNode;
-            const isTight = n.parentNode.nodeName !== "P" || grandParent?.nodeName !== "LI";
-            if (!isTight) {
-                newParts.push(partCreator.newline());
-            }
-        } else if (n.nodeType === Node.ELEMENT_NODE) {
-            const parseResult = parseElement(n as HTMLElement, partCreator, lastNode, state);
-            if (parseResult) {
-                if (Array.isArray(parseResult)) {
-                    newParts.push(...parseResult);
-                } else {
-                    newParts.push(parseResult);
-                }
-            }
-        }
-
-        if (newParts.length && inQuote) {
-            const isFirstPart = parts.length === 0;
-            prefixQuoteLines(isFirstPart, newParts, partCreator);
-        }
-
-        parts.push(...newParts);
-
-        const descend = checkDescendInto(n);
-        // when not descending (like for PRE), onNodeLeave won't be called to set lastNode
-        // so do that here.
-        lastNode = descend ? null : n;
-        return descend;
+    const parts = parseNode(new DOMParser().parseFromString(html, "text/html").body, pc); // start at <body>
+    if (isQuotedMessage) {
+        prefixLines(parts, "> ", pc); // quote every line of the converted message
    }
-
-    function onNodeLeave(n: Node) {
-        if (checkIgnored(n)) {
-            return;
-        }
-        switch (n.nodeName) {
-            case "BLOCKQUOTE":
-                inQuote = false;
-                break;
-            case "OL":
-                state.listIndex.pop();
-                /* falls through */
-            case "UL":
-                state.listDepth -= 1;
-                break;
-        }
-        lastNode = n;
-    }
-
-    walkDOMDepthFirst(rootNode, onNodeEnter, onNodeLeave);
-
    return parts;
 }

-export function parsePlainTextMessage(body: string, partCreator: PartCreator, isQuotedMessage?: boolean): Part[] {
+export function parsePlainTextMessage(body: string, pc: PartCreator, isQuotedMessage?: boolean): Part[] {
    const lines = body.split(/\r\n|\r|\n/g); // split on any new-line combination not just \n, collapses \r\n
    return lines.reduce((parts, line, i) => {
        if (isQuotedMessage) {
-            parts.push(partCreator.plain(QUOTE_LINE_PREFIX));
+            parts.push(pc.plain("> ")); // every line of a quoted message gets the quote prefix
        }
-        parts.push(...parseAtRoomMentions(line, partCreator));
+        parts.push(...parseAtRoomMentions(line, pc)); // line text plus any @room pills
        const isLast = i === lines.length - 1;
        if (!isLast) {
-            parts.push(partCreator.newline());
+            parts.push(pc.newline()); // newline parts go between lines, not after the last one
        }
        return parts;
    }, [] as Part[]);
 }

-export function parseEvent(event: MatrixEvent, partCreator: PartCreator, { isQuotedMessage = false } = {}) {
+export function parseEvent(event: MatrixEvent, pc: PartCreator, { isQuotedMessage = false } = {}) {
    const content = event.getContent();
    let parts: Part[];
    const isEmote = content.msgtype === "m.emote";
    let isRainbow = false;

    if (content.format === "org.matrix.custom.html") {
-        parts = parseHtmlMessage(content.formatted_body || "", partCreator, isQuotedMessage);
+        parts = parseHtmlMessage(content.formatted_body || "", pc, isQuotedMessage); // HTML-formatted body
        if (content.body && content.formatted_body && textToHtmlRainbow(content.body) === content.formatted_body) {
            isRainbow = true;
        }
    } else {
-        parts = parsePlainTextMessage(content.body || "", partCreator, isQuotedMessage);
+        parts = parsePlainTextMessage(content.body || "", pc, isQuotedMessage); // plain-text body
    }

    if (isEmote && isRainbow) {
-        parts.unshift(partCreator.plain("/rainbowme "));
+        parts.unshift(pc.plain("/rainbowme ")); // emote and rainbow: prefix the /rainbowme command text
    } else if (isRainbow) {
-        parts.unshift(partCreator.plain("/rainbow "));
+        parts.unshift(pc.plain("/rainbow ")); // rainbow only
    } else if (isEmote) {
-        parts.unshift(partCreator.plain("/me "));
+        parts.unshift(pc.plain("/me ")); // emote only
    }

    return parts;
--- a/test/editor/snapshots/deserialize-test.js.snap
+++ b/test/editor/snapshots/deserialize-test.js.snap
@ -0,0 +1,178 @@
+// Jest Snapshot v1, https://goo.gl/fbAQLP
+
+exports[`editor/deserialize html messages escapes angle brackets 1`] = `
+Array [
+  Object {
+    "text": "\\\\> \\\\\\\\<del>no formatting here\\\\\\\\</del>",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages escapes asterisks 1`] = `
+Array [
+  Object {
+    "text": "\\\\*hello\\\\*",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages escapes backslashes 1`] = `
+Array [
+  Object {
+    "text": "C:\\\\\\\\My Documents",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages escapes backticks in code blocks 1`] = `
+Array [
+  Object {
+    "text": "\`\`this → \` is a backtick\`\`",
+    "type": "plain",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "\`\`\`\`",
+    "type": "plain",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "and here are 3 of them:",
+    "type": "plain",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "\`\`\`",
+    "type": "plain",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "\`\`\`\`",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages escapes backticks outside of code blocks 1`] = `
+Array [
+  Object {
+    "text": "some \\\\\`backticks\\\\\`",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages escapes square brackets 1`] = `
+Array [
+  Object {
+    "text": "\\\\[not an actual link\\\\](https://example.org)",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages escapes underscores 1`] = `
+Array [
+  Object {
+    "text": "\\\\_\\\\_emphasis\\\\_\\\\_",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages preserves nested formatting 1`] = `
+Array [
+  Object {
+    "text": "a<sub>b_c**d<u>e</u>**_</sub>",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages preserves nested quotes 1`] = `
+Array [
+  Object {
+    "text": "> foo",
+    "type": "plain",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "> ",
+    "type": "plain",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "> > bar",
+    "type": "plain",
+  },
+]
+`;
+
+exports[`editor/deserialize html messages surrounds lists with newlines 1`] = `
+Array [
+  Object {
+    "text": "foo",
+    "type": "plain",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "- bar",
+    "type": "plain",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "
+",
+    "type": "newline",
+  },
+  Object {
+    "text": "baz",
+    "type": "plain",
+  },
+]
+`;
--- a/test/editor/deserialize-test.js
+++ b/test/editor/deserialize-test.js
@ -237,18 +237,6 @@ describe('editor/deserialize', function() {
            expect(parts[3]).toStrictEqual({ type: "newline", text: "\n" });
            expect(parts[4]).toStrictEqual({ type: "plain", text: "3. Finish" });
        });
-        it('non tight lists', () => {
-            const html = "<ol><li><p>Start</p></li><li><p>Continue</p></li><li><p>Finish</p></li></ol>";
-            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
-            expect(parts.length).toBe(8);
-            expect(parts[0]).toStrictEqual({ type: "plain", text: "1. Start" });
-            expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });
-            expect(parts[2]).toStrictEqual({ type: "newline", text: "\n" });
-            expect(parts[3]).toStrictEqual({ type: "plain", text: "2. Continue" });
-            expect(parts[4]).toStrictEqual({ type: "newline", text: "\n" });
-            expect(parts[5]).toStrictEqual({ type: "newline", text: "\n" });
-            expect(parts[6]).toStrictEqual({ type: "plain", text: "3. Finish" });
-        });
        it('nested unordered lists', () => {
            const html = "<ul><li>Oak<ul><li>Spruce<ul><li>Birch</li></ul></li></ul></li></ul>";
            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
@ -269,13 +257,13 @@ describe('editor/deserialize', function() {
            expect(parts[3]).toStrictEqual({ type: "newline", text: "\n" });
            expect(parts[4]).toStrictEqual({ type: "plain", text: `${FOUR_SPACES.repeat(2)}1. Birch` });
        });
-        it('nested tight lists', () => {
+        it('nested lists', () => {
            const html = "<ol><li>Oak\n<ol><li>Spruce\n<ol><li>Birch</li></ol></li></ol></li></ol>";
            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
            expect(parts.length).toBe(5);
-            expect(parts[0]).toStrictEqual({ type: "plain", text: "1. Oak" });
+            expect(parts[0]).toStrictEqual({ type: "plain", text: "1. Oak\n" });
            expect(parts[1]).toStrictEqual({ type: "newline", text: "\n" });
-            expect(parts[2]).toStrictEqual({ type: "plain", text: `${FOUR_SPACES}1. Spruce` });
+            expect(parts[2]).toStrictEqual({ type: "plain", text: `${FOUR_SPACES}1. Spruce\n` });
            expect(parts[3]).toStrictEqual({ type: "newline", text: "\n" });
            expect(parts[4]).toStrictEqual({ type: "plain", text: `${FOUR_SPACES.repeat(2)}1. Birch` });
        });
@ -291,5 +279,56 @@ describe('editor/deserialize', function() {
            expect(parts.length).toBe(1);
            expect(parts[0]).toStrictEqual({ type: "plain", text: "/me says _DON'T SHOUT_!" });
        });
+        it('preserves nested quotes', () => {
+            const html = "<blockquote>foo<blockquote>bar</blockquote></blockquote>";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('surrounds lists with newlines', () => {
+            const html = "foo<ul><li>bar</li></ul>baz";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('preserves nested formatting', () => {
+            const html = "a<sub>b<em>c<strong>d<u>e</u></strong></em></sub>";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('escapes backticks in code blocks', () => {
+            const html = "<p><code>this → ` is a backtick</code></p>" +
+                "<pre><code>and here are 3 of them:\n```</code></pre>";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('escapes backticks outside of code blocks', () => {
+            const html = "some `backticks`";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('escapes backslashes', () => {
+            const html = "C:\\My Documents";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('escapes asterisks', () => {
+            const html = "*hello*";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('escapes underscores', () => {
+            const html = "__emphasis__";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('escapes square brackets', () => {
+            const html = "[not an actual link](https://example.org)";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
+        it('escapes angle brackets', () => {
+            const html = "> \\<del>no formatting here\\</del>";
+            const parts = normalize(parseEvent(htmlMessage(html), createPartCreator()));
+            expect(parts).toMatchSnapshot();
+        });
    });
 });