diff --git a/AGENTS.md b/AGENTS.md index 3eafcee68..48559bc32 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -246,9 +246,9 @@ Follow existing patterns in `mg-fs-utils` or `mg-tinynews`. ## HTML & XML Parsing -**Use `@tryghost/mg-utils` for all HTML and XML parsing. Do not use `cheerio`.** +**Use `@tryghost/mg-utils` for all HTML and XML parsing. Do not use `cheerio` or `jsdom`.** -See the [`mg-utils` README](packages/mg-utils/README.md) for full API documentation. +Powered by [linkedom](https://github.com/WebReflection/linkedom) — lightweight and memory-efficient. See the [`mg-utils` README](packages/mg-utils/README.md) for full API documentation. ```javascript import {xmlUtils, domUtils} from '@tryghost/mg-utils'; @@ -258,10 +258,17 @@ const parsed = await xmlUtils.parseXml(xmlString); const channel = parsed.rss.channel; const items = [].concat(channel.item || []); // normalize single/array -// HTML: parse, manipulate, serialize -const frag = domUtils.parseFragment(html); -frag.$('.unwanted').forEach(el => el.remove()); -const output = frag.html(); +// HTML: use processFragment for automatic cleanup +const output = domUtils.processFragment(html, (frag) => { + frag.$('.unwanted').forEach(el => el.remove()); + return frag.html(); +}); + +// Async version when the callback needs to await +const output = await domUtils.processFragmentAsync(html, async (frag) => { + // ... async operations ... + return frag.html(); +}); ``` ## Error Handling diff --git a/packages/mg-blogger/lib/process.js b/packages/mg-blogger/lib/process.js index af3d39aaa..af7411285 100644 --- a/packages/mg-blogger/lib/process.js +++ b/packages/mg-blogger/lib/process.js @@ -360,9 +360,13 @@ const processPosts = async (posts, options) => { // Filter out falsy items in the post list posts = posts.filter(i => i); - return Promise.all(posts.map((post) => { - return processPost(post, options); - })); + const results = []; + + for (let i = 0; i < posts.length; i++) { + results.push(await processPost(posts[i], options)); + } + + return results; }; const all = async (input, {options}) => { diff --git a/packages/mg-chorus/lib/processor.js b/packages/mg-chorus/lib/processor.js index 7f99e42be..8006e93b9 100644 --- a/packages/mg-chorus/lib/processor.js +++ b/packages/mg-chorus/lib/processor.js @@ -120,7 +120,14 @@ const processPost = (data, options) => { }; const processPosts = (posts, options) => { - return posts.map(post => processPost(post, options)); + const results = []; + for (let i = 0; i < posts.length; i++) { + const post = posts[i]; + if (post) { + results.push(processPost(post, options)); + } + } + return results; }; const all = ({result, options}) => { diff --git a/packages/mg-curated-export/lib/process.js b/packages/mg-curated-export/lib/process.js index 914d53d26..b72927418 100644 --- a/packages/mg-curated-export/lib/process.js +++ b/packages/mg-curated-export/lib/process.js @@ -32,7 +32,13 @@ export default (input, ctx) => { }); if (input.posts && input.posts.length > 0) { - output.posts = input.posts.map(post => processPost(post.json, globalUser, tags, ctx)); + output.posts = []; + for (let i = 0; i < input.posts.length; i++) { + const post = input.posts[i]; + if (post) { + output.posts.push(processPost(post.json, globalUser, tags, ctx)); + } + } } return output; diff --git a/packages/mg-ghost-api/lib/processor.js b/packages/mg-ghost-api/lib/processor.js index 161df21a7..231cc3085 100644 --- a/packages/mg-ghost-api/lib/processor.js +++ b/packages/mg-ghost-api/lib/processor.js @@ -47,7 +47,14 @@ const processPost = (ghPost) => { }; const processPosts = (posts) => { - return posts.map(post => processPost(post)); + const results = []; + for (let i = 0; i < posts.length; i++) { + const post = posts[i]; + if (post) { + results.push(processPost(post)); + } + } + return results; }; const processAuthor = (ghAuthor) => { diff --git a/packages/mg-jekyll-export/lib/process.js b/packages/mg-jekyll-export/lib/process.js index 1e4c3e096..37ab33dae 100644 --- a/packages/mg-jekyll-export/lib/process.js +++ b/packages/mg-jekyll-export/lib/process.js @@ -16,7 +16,13 @@ export default (input, options = {}) => { }; if (input.posts && input.posts.length > 0) { - output.posts = input.posts.map(post => processPost(post.fileName, post.fileContents, globalUser, options)); + output.posts = []; + for (let i = 0; i < input.posts.length; i++) { + const post = input.posts[i]; + if (post) { + output.posts.push(processPost(post.fileName, post.fileContents, globalUser, options)); + } + } } return output; diff --git a/packages/mg-letterdrop/lib/processor.js b/packages/mg-letterdrop/lib/processor.js index ea7c21031..ebc7584f5 100644 --- a/packages/mg-letterdrop/lib/processor.js +++ b/packages/mg-letterdrop/lib/processor.js @@ -168,7 +168,14 @@ const processPost = (data, options) => { }; const processPosts = (posts, options) => { - return posts.map(post => processPost(post, options)); + const results = []; + for (let i = 0; i < posts.length; i++) { + const post = posts[i]; + if (post) { + results.push(processPost(post, options)); + } + } + return results; }; const all = ({result, options}) => { diff --git a/packages/mg-letterdrop/test/processor.test.js b/packages/mg-letterdrop/test/processor.test.js index 3f7274faf..b627c9865 100644 --- a/packages/mg-letterdrop/test/processor.test.js +++ b/packages/mg-letterdrop/test/processor.test.js @@ -1,15 +1,18 @@ import assert from 'node:assert/strict'; +import {readFileSync} from 'node:fs'; +import {dirname, join} from 'node:path'; import {describe, it} from 'node:test'; -import {createRequire} from 'node:module'; +import {fileURLToPath} from 'node:url'; import processor from '../lib/processor.js'; -const require = createRequire(import.meta.url); -const fixture = require('./fixtures/api-response.json'); +const __dirname = dirname(fileURLToPath(import.meta.url)); +const fixtureData = JSON.parse(readFileSync(join(__dirname, 'fixtures/api-response.json'), 'utf8')); +const fixture = () => structuredClone(fixtureData); describe('Process', function () { it('Can convert a single post', function () { const ctx = { - result: fixture, + result: fixture(), options: { url: 'https://example.com', addPrimaryTag: 'Newsletter', @@ -68,7 +71,7 @@ describe('Process', function () { it('Converts signup iframes to Portal links', function () { const ctx = { - result: fixture, + result: fixture(), options: { url: 'https://example.com', addPrimaryTag: 'Newsletter', diff --git a/packages/mg-libsyn/lib/processor.js b/packages/mg-libsyn/lib/processor.js index b92e1d14c..ea8fbcba5 100644 --- a/packages/mg-libsyn/lib/processor.js +++ b/packages/mg-libsyn/lib/processor.js @@ -167,7 +167,14 @@ const processPost = (libsynPost, author, tags, options, errors) => { // eslint-d return post; }; const processPosts = (posts, author, tags, options, errors) => { // eslint-disable-line no-shadow - return posts.map(post => processPost(post, author, tags, options, errors)); + const results = []; + for (let i = 0; i < posts.length; i++) { + const post = posts[i]; + if (post) { + results.push(processPost(post, author, tags, options, errors)); + } + } + return results; }; const all = ({result, errors, options}) => { // eslint-disable-line no-shadow diff --git a/packages/mg-libsyn/test/processor.test.js b/packages/mg-libsyn/test/processor.test.js index eaefc8a18..b9585367e 100644 --- a/packages/mg-libsyn/test/processor.test.js +++ b/packages/mg-libsyn/test/processor.test.js @@ -1,10 +1,13 @@ import assert from 'node:assert/strict'; +import {readFileSync} from 'node:fs'; +import {dirname, join} from 'node:path'; import {describe, it} from 'node:test'; -import {createRequire} from 'node:module'; +import {fileURLToPath} from 'node:url'; import processor from '../lib/processor.js'; -const require = createRequire(import.meta.url); -const fixture = require('./fixtures/feed.json'); +const __dirname = dirname(fileURLToPath(import.meta.url)); +const fixtureData = JSON.parse(readFileSync(join(__dirname, 'fixtures/feed.json'), 'utf8')); +const fixture = () => structuredClone(fixtureData); describe('durationToSeconds', function () { it('Minutes with no seconds', function () { @@ -41,7 +44,7 @@ describe('durationToSeconds', function () { describe('Process posts', function () { it('Can process posts', function () { const data = { - posts: fixture.rss.channel.item, + posts: fixture().rss.channel.item, author: { name: 'Test Author', slug: 'test-author', @@ -61,7 +64,7 @@ describe('Process posts', function () { it('Post has required fields', function () { const data = { - posts: fixture.rss.channel.item, + posts: fixture().rss.channel.item, author: { name: 'Test Author', slug: 'test-author', @@ -118,7 +121,7 @@ describe('Process posts', function () { it('Can add a tag', function () { const data = { - posts: fixture.rss.channel.item, + posts: fixture().rss.channel.item, author: { name: 'Test Author', slug: 'test-author', @@ -152,7 +155,7 @@ describe('Process posts', function () { it('Can use feed categories', function () { const data = { - posts: fixture.rss.channel.item, + posts: fixture().rss.channel.item, tags: ['Lorem', 'Ipsum', 'dolor'], author: { name: 'Test Author', @@ -200,7 +203,7 @@ describe('Process posts', function () { it('Can use item categories', function () { const data = { - posts: fixture.rss.channel.item, + posts: fixture().rss.channel.item, author: { name: 'Test Author', slug: 'test-author', @@ -245,7 +248,7 @@ describe('Process posts', function () { describe('Process content', function () { it('Remove empty p tags', function () { const data = { - posts: fixture.rss.channel.item, + posts: fixture().rss.channel.item, author: { name: 'Test Author', slug: 'test-author', @@ -268,7 +271,7 @@ describe('Process content', function () { it('Use Libsyn embeds', function () { const data = { - posts: fixture.rss.channel.item, + posts: fixture().rss.channel.item, author: { name: 'Test Author', slug: 'test-author', @@ -293,7 +296,7 @@ describe('Process content', function () { it('Use Audio cards', function () { const data = { - posts: fixture.rss.channel.item, + posts: fixture().rss.channel.item, author: { name: 'Test Author', slug: 'test-author', diff --git a/packages/mg-linkfixer/lib/LinkFixer.js b/packages/mg-linkfixer/lib/LinkFixer.js index ef4fe2a1e..9b7a4043d 100644 --- a/packages/mg-linkfixer/lib/LinkFixer.js +++ b/packages/mg-linkfixer/lib/LinkFixer.js @@ -2,7 +2,7 @@ import {join} from 'node:path'; import _ from 'lodash'; import {domUtils} from '@tryghost/mg-utils'; -const {parseFragment} = domUtils; +const {processFragment} = domUtils; // @TODO: expand this list const htmlFields = ['html']; @@ -130,24 +130,24 @@ export default class LinkFixer { } async processHTML(html) { - const parsed = parseFragment(html); + return processFragment(html, (parsed) => { + for (const el of parsed.$('a')) { + let href = el.getAttribute('href'); - for (const el of parsed.$('a')) { - let href = el.getAttribute('href'); + if (!href) { + continue; + } - if (!href) { - continue; - } - - // Clean the URL, matching the links stored in the linkMap - let updatedURL = this.cleanURL(href); + // Clean the URL, matching the links stored in the linkMap + let updatedURL = this.cleanURL(href); - if (this.linkMap[updatedURL]) { - el.setAttribute('href', this.linkMap[updatedURL]); + if (this.linkMap[updatedURL]) { + el.setAttribute('href', this.linkMap[updatedURL]); + } } - } - return parsed.html(); + return parsed.html(); + }); } async processLexical(lexical) { diff --git a/packages/mg-medium-export/lib/process-post.js b/packages/mg-medium-export/lib/process-post.js index 53126482a..d861486c3 100644 --- a/packages/mg-medium-export/lib/process-post.js +++ b/packages/mg-medium-export/lib/process-post.js @@ -83,102 +83,102 @@ const processTags = ({tagLinks}) => { }; const processFeatureImage = ({html, post, options}) => { - const parsed = domUtils.parseFragment(html); - - // Look for data-is-featured - let featured = parsed.$('[data-is-featured]')[0] || null; + return domUtils.processFragment(html, (parsed) => { + // Look for data-is-featured + let featured = parsed.$('[data-is-featured]')[0] || null; + + // Look for an image that appears before content + let allSections = parsed.$(sectionTags.join(',')); + let foundImg = false; + let preImageTags = []; + + allSections.forEach((el) => { + if (!foundImg) { + preImageTags.push(el.tagName.toLowerCase()); + } - // Look for an image that appears before content - let allSections = parsed.$(sectionTags.join(',')); - let foundImg = false; - let preImageTags = []; + if (!foundImg && el.tagName.toLowerCase() === 'img') { + foundImg = el; + } + }); - allSections.forEach((el) => { - if (!foundImg) { - preImageTags.push(el.tagName.toLowerCase()); + // We don't have a designated feature image, but there's an image above the content so use that image instead + if (!featured && !preImageTags.includes('p')) { + featured = foundImg; + + if (options?.addPlatformTag) { + // tag it with #auto-feature-image so we can tell the difference + post.data.tags.push({ + data: { + name: '#auto-feature-image' + } + }); + } } - if (!foundImg && el.tagName.toLowerCase() === 'img') { - foundImg = el; + if (featured) { + post.data.feature_image = featured.getAttribute('src'); + post.data.feature_image_alt = featured.getAttribute('alt') || null; + const figure = featured.closest('figure'); + const figcaption = figure ? figure.querySelector('figcaption') : null; + post.data.feature_image_caption = figcaption ? domUtils.serializeChildren(figcaption).trim() : null; + + if (figure) { + figure.remove(); + } } + + return parsed.html().trim(); }); +}; + +export default ({name, html, globalUser, options}) => { + return domUtils.processFragment(html, (parsed) => { + let post = processMeta({name, parsed, options}); + + // Process author + const pAuthor = parsed.$('.p-author')[0]; + if (pAuthor) { + post.data.author = processAuthor({pAuthor}); + } else if (globalUser) { + post.data.author = globalUser; + } - // We don't have a designated feature image, but there's an image above the content so use that image instead - if (!featured && !preImageTags.includes('p')) { - featured = foundImg; + post.data.tags = []; - if (options?.addPlatformTag) { - // tag it with #auto-feature-image so we can tell the difference + if (options?.addTag) { post.data.tags.push({ + url: 'migrator-added-tag', data: { - name: '#auto-feature-image' + name: options.addTag } }); } - } - if (featured) { - post.data.feature_image = featured.getAttribute('src'); - post.data.feature_image_alt = featured.getAttribute('alt') || null; - const figure = featured.closest('figure'); - const figcaption = figure ? figure.querySelector('figcaption') : null; - post.data.feature_image_caption = figcaption ? domUtils.serializeChildren(figcaption).trim() : null; - - if (figure) { - figure.remove(); + // Process tags + const tagLinks = parsed.$('.p-tags a'); + if (tagLinks.length) { + post.data.tags = [...post.data.tags, ...processTags({tagLinks})]; } - } - return parsed.html().trim(); -}; - -export default ({name, html, globalUser, options}) => { - const parsed = domUtils.parseFragment(html); - - let post = processMeta({name, parsed, options}); - - // Process author - const pAuthor = parsed.$('.p-author')[0]; - if (pAuthor) { - post.data.author = processAuthor({pAuthor}); - } else if (globalUser) { - post.data.author = globalUser; - } - - post.data.tags = []; - - if (options?.addTag) { - post.data.tags.push({ - url: 'migrator-added-tag', - data: { - name: options.addTag - } - }); - } - - // Process tags - const tagLinks = parsed.$('.p-tags a'); - if (tagLinks.length) { - post.data.tags = [...post.data.tags, ...processTags({tagLinks})]; - } - - if (options?.addPlatformTag) { - post.data.tags.push({ - url: 'migrator-added-platform-tag', - data: { - name: '#medium' - } - }); - } + if (options?.addPlatformTag) { + post.data.tags.push({ + url: 'migrator-added-platform-tag', + data: { + name: '#medium' + } + }); + } - // Process content - const eContent = parsed.$('.e-content')[0]; - const contentHtml = eContent ? domUtils.serializeChildren(eContent) : ''; - post = processContent({html: contentHtml, post}); + // Process content + const eContent = parsed.$('.e-content')[0]; + const contentHtml = eContent ? domUtils.serializeChildren(eContent) : ''; + post = processContent({html: contentHtml, post}); - // Grab the featured image - // Do this last so that we can add tags to indicate feature image style - post.data.html = processFeatureImage({html: post.data.html, post, options}); + // Grab the featured image + // Do this last so that we can add tags to indicate feature image style + post.data.html = processFeatureImage({html: post.data.html, post, options}); - return post; + return post; + }); }; diff --git a/packages/mg-medium-export/lib/process-profile.js b/packages/mg-medium-export/lib/process-profile.js index 1efb95bb9..24428f27c 100644 --- a/packages/mg-medium-export/lib/process-profile.js +++ b/packages/mg-medium-export/lib/process-profile.js @@ -1,4 +1,5 @@ import {domUtils} from '@tryghost/mg-utils'; +const {processFragment} = domUtils; // Keys we've seen so far // Profile @@ -22,26 +23,27 @@ const mediumToGhost = { }; export default ({html}) => { - const parsed = domUtils.parseFragment(html); - let profile = { - url: parsed.$('.u-url')[0]?.getAttribute('href'), - data: { - name: parsed.$('.p-name')[0]?.textContent || '', - profile_image: parsed.$('.u-photo')[0]?.getAttribute('src'), - roles: [ - 'Contributor' - ] - } - }; + return processFragment(html, (parsed) => { + let profile = { + url: parsed.$('.u-url')[0]?.getAttribute('href'), + data: { + name: parsed.$('.p-name')[0]?.textContent || '', + profile_image: parsed.$('.u-photo')[0]?.getAttribute('src'), + roles: [ + 'Contributor' + ] + } + }; - parsed.$('ul li').forEach((el) => { - let [item, value] = el.textContent.split(': '); - let key = mediumToGhost[item.toLowerCase()] || null; + parsed.$('ul li').forEach((el) => { + let [item, value] = el.textContent.split(': '); + let key = mediumToGhost[item.toLowerCase()] || null; - if (key) { - profile.data[key] = value; - } - }); + if (key) { + profile.data[key] = value; + } + }); - return profile; + return profile; + }); }; diff --git a/packages/mg-medium-export/lib/process.js b/packages/mg-medium-export/lib/process.js index 4bc16956a..902657f4e 100644 --- a/packages/mg-medium-export/lib/process.js +++ b/packages/mg-medium-export/lib/process.js @@ -11,7 +11,13 @@ export default (input, options) => { let globalUser = output.users && output.users.length === 1 ? output.users[0] : null; if (input.posts && input.posts.length > 0) { - output.posts = input.posts.map(post => processPost({name: post.name, html: post.html, globalUser, options})); + output.posts = []; + for (let i = 0; i < input.posts.length; i++) { + const post = input.posts[i]; + if (post) { + output.posts.push(processPost({name: post.name, html: post.html, globalUser, options})); + } + } } return output; diff --git a/packages/mg-squarespace-xml/lib/process.js b/packages/mg-squarespace-xml/lib/process.js index fcf0a18f1..73d7c7734 100644 --- a/packages/mg-squarespace-xml/lib/process.js +++ b/packages/mg-squarespace-xml/lib/process.js @@ -33,97 +33,91 @@ const processContent = (html, options) => { return ''; } - const parsed = domUtils.parseFragment(html); - - if (options?.removeSelectors) { - parsed.$(options.removeSelectors).forEach((el) => { - el.remove(); - }); - } + return domUtils.processFragment(html, (parsed) => { + if (options?.removeSelectors) { + parsed.$(options.removeSelectors).forEach((el) => { + el.remove(); + }); + } - parsed.$('.sqs-audio-embed').forEach((el) => { - let audioSrc = el.getAttribute('data-url'); - let audioTitle = el.getAttribute('data-title'); + parsed.$('.sqs-audio-embed').forEach((el) => { + let audioSrc = el.getAttribute('data-url'); + let audioTitle = el.getAttribute('data-title'); - let cardOpts = { - env: {dom: new SimpleDom.Document()}, - payload: { - src: audioSrc, - title: audioTitle - } - }; + let cardOpts = { + env: {dom: new SimpleDom.Document()}, + payload: { + src: audioSrc, + title: audioTitle + } + }; - const buildCard = audioCard.render(cardOpts); - const cardHTML = buildCard.nodeValue; + const buildCard = audioCard.render(cardOpts); + const cardHTML = buildCard.nodeValue; - domUtils.replaceWith(el, cardHTML); - }); + domUtils.replaceWith(el, cardHTML); + }); - parsed.$('.newsletter-form-wrapper').forEach((el) => { - el.remove(); - }); + parsed.$('.newsletter-form-wrapper').forEach((el) => { + el.remove(); + }); - // squarespace images without src - parsed.$('img[data-src]').forEach((img) => { - const src = img.getAttribute('data-src'); - if (img.classList.contains('thumb-image')) { - // images with the `thumb-image` class might be a duplicate - // to prevent migrating two images, we have to remove the false node - // Walk backwards to find the noscript sibling - let sibling = img.previousElementSibling; - while (sibling) { - if (sibling.tagName === 'NOSCRIPT') { - const noscriptImg = sibling.querySelector('img'); - if (noscriptImg && noscriptImg.getAttribute('src') === src) { - img.remove(); + // squarespace images without src + parsed.$('img[data-src]').forEach((img) => { + const src = img.getAttribute('data-src'); + if (img.classList.contains('thumb-image')) { + // images with the `thumb-image` class might be a duplicate + // to prevent migrating two images, we have to remove the false node + // Walk backwards to find the noscript sibling + let sibling = img.previousElementSibling; + while (sibling) { + if (sibling.tagName === 'NOSCRIPT') { + const noscriptImg = sibling.querySelector('img'); + if (noscriptImg && noscriptImg.getAttribute('src') === src) { + img.remove(); + } + break; } - break; + sibling = sibling.previousElementSibling; } - sibling = sibling.previousElementSibling; + } else { + img.setAttribute('src', img.getAttribute('data-src')); } - } else { - img.setAttribute('src', img.getAttribute('data-src')); - } - }); - - parsed.$('figure blockquote').forEach((el) => { - const nextSibling = el.nextElementSibling; - let captionText = ''; - if (nextSibling && nextSibling.tagName === 'FIGCAPTION') { - captionText = `

${domUtils.serializeChildren(nextSibling)}`; - nextSibling.remove(); - } - el.innerHTML = `

${domUtils.serializeChildren(el)}${captionText}

`; - }); - - parsed.$('.sqs-video-wrapper').forEach((el) => { - const theHtml = decode(el.getAttribute('data-html')); - const embedWrapper = el.closest('.embed-block-wrapper'); - const parent = embedWrapper ? embedWrapper.parentElement : null; + }); - if (parent) { - domUtils.replaceWith(parent, `
${theHtml}
`); - } - }); + parsed.$('figure blockquote').forEach((el) => { + const nextSibling = el.nextElementSibling; + let captionText = ''; + if (nextSibling && nextSibling.tagName === 'FIGCAPTION') { + captionText = `

${domUtils.serializeChildren(nextSibling)}`; + nextSibling.remove(); + } + el.innerHTML = `

${domUtils.serializeChildren(el)}${captionText}

`; + }); - // TODO: this should be a parser plugin - // Wrap nested lists in HTML card - parsed.$('ul li ul, ol li ol, ol li ul, ul li ol').forEach((nestedList) => { - // Walk up to the nearest parent ul/ol (equivalent to parentsUntil('ul, ol').parent()) - let topList = nestedList.parentElement?.closest('ul, ol'); - if (topList) { - domUtils.insertBefore(topList, ''); - domUtils.insertAfter(topList, ''); - } - }); + parsed.$('.sqs-video-wrapper').forEach((el) => { + const theHtml = decode(el.getAttribute('data-html')); + const embedWrapper = el.closest('.embed-block-wrapper'); + const parent = embedWrapper ? embedWrapper.parentElement : null; - // Convert HTML back to a string - html = parsed.html(); + if (parent) { + domUtils.replaceWith(parent, `
${theHtml}
`); + } + }); - // Trim whitespace - html = html.trim(); + // TODO: this should be a parser plugin + // Wrap nested lists in HTML card + parsed.$('ul li ul, ol li ol, ol li ul, ul li ol').forEach((nestedList) => { + // Walk up to the nearest parent ul/ol (equivalent to parentsUntil('ul, ol').parent()) + let topList = nestedList.parentElement?.closest('ul, ol'); + if (topList) { + domUtils.insertBefore(topList, ''); + domUtils.insertAfter(topList, ''); + } + }); - return html; + return parsed.html().trim(); + }); }; // The feature images is not "connected" to the post, other than it's located @@ -279,12 +273,17 @@ const processPost = (sqPost, index, items, users, options) => { const processPosts = (items, users, options) => { const postsOutput = []; - items.forEach((sqPost, index) => { - postsOutput.push(processPost(sqPost, index, items, users, options)); - }); + for (let i = 0; i < items.length; i++) { + const item = items[i]; + if (item) { + const result = processPost(item, i, items, users, options); + if (result) { + postsOutput.push(result); + } + } + } - // don't return empty post objects - return postsOutput.filter(post => post); + return postsOutput; }; const processUsers = (authors) => { diff --git a/packages/mg-substack/lib/process.js b/packages/mg-substack/lib/process.js index bde40f430..7ce96b1ce 100644 --- a/packages/mg-substack/lib/process.js +++ b/packages/mg-substack/lib/process.js @@ -862,9 +862,13 @@ export default async (input, ctx) => { } if (input.posts && input.posts.length > 0) { - output.posts = input.posts.map((post) => { - return processPost(post, siteUrl, options); - }); + output.posts = []; + for (let i = 0; i < input.posts.length; i++) { + const post = input.posts[i]; + if (post) { + output.posts.push(processPost(post, siteUrl, options)); + } + } } return output; diff --git a/packages/mg-utils/README.md b/packages/mg-utils/README.md index 97eacc2f3..094c785a4 100644 --- a/packages/mg-utils/README.md +++ b/packages/mg-utils/README.md @@ -17,42 +17,80 @@ or ### DOM Utilities -Parse and manipulate HTML fragments with proper HTML5 serialization: +Lightweight HTML parsing and manipulation powered by [linkedom](https://github.com/WebReflection/linkedom). Use `processFragment` for most cases — it parses HTML, passes the fragment to your callback, and automatically cleans up: ```js import {domUtils} from '@tryghost/mg-utils'; -const { - parseFragment, - serializeNode, - serializeChildren, - replaceWith, - insertBefore, - insertAfter, - wrap, - createElement, - attr, - is, - parents, - lastParent, - setStyle, - isComment, - getCommentData -} = domUtils; - -// Parse an HTML fragment -const parsed = parseFragment('

Hello World

'); - -// Query elements (returns array) -const paragraphs = parsed.$('p'); - -// Get serialized HTML -const html = parsed.html(); - -// Get text content -const text = parsed.text(); +const {processFragment, processFragmentAsync} = domUtils; + +// Parse, manipulate, and get the result in one step +const html = processFragment('

Hello

World

', (parsed) => { + for (const el of parsed.$('.remove')) { + el.remove(); + } + return parsed.html(); +}); +// => '

Hello

' + +// Extract data from HTML +const title = processFragment(rawHtml, parsed => parsed.$('h1')[0]?.textContent || ''); + +// Async version for callbacks that need to await +const result = await processFragmentAsync(html, async (parsed) => { + for (const img of parsed.$('img')) { + const newSrc = await processImage(img.getAttribute('src')); + img.setAttribute('src', newSrc); + } + return parsed.html(); +}); ``` +The `parsed` fragment provides: +- **`parsed.$(selector, context?)`** — query elements (returns `Element[]`) +- **`parsed.html()`** — serialize the fragment back to an HTML string +- **`parsed.text()`** — get text content +- **`parsed.document`** — access the underlying `Document` +- **`parsed.body`** — access the `` element + +For long-lived or complex processing where a callback doesn't fit, use `parseFragment` directly: + +```js +const {parseFragment} = domUtils; + +const parsed = parseFragment(html); +// ... extensive manipulation ... +const result = parsed.html(); +``` + +### DOM Manipulation Helpers + +```js +const {replaceWith, insertBefore, insertAfter, wrap, createElement, attr} = domUtils; + +const parsed = parseFragment('

Old

'); +const p = parsed.$('p')[0]; + +replaceWith(p, 'New'); // Replace element with HTML string or Node +insertBefore(el, ''); // Insert before element +insertAfter(el, ''); // Insert after element +wrap(el, '
'); // Wrap element in a new parent + +const div = createElement(parsed.document, 'div', {class: 'wrapper'}); + +attr(el, 'href'); // Get attribute (returns '' if missing) +attr(el, 'href', '/new-url'); // Set attribute +``` + +### Additional Element Utilities + +- **`is(el, selector)`** — check if element matches a CSS selector +- **`parents(el, selector?)`** — get all parent elements, optionally filtered +- **`lastParent(el, selector)`** — get the furthest parent matching selector +- **`setStyle(el, property, value)`** — set a CSS style property +- **`isComment(node)`** / **`getCommentData(node)`** — comment node helpers +- **`serializeNode(node)`** / **`serializeChildren(node)`** — HTML5-compliant serialization + ### XML Utilities Parse XML strings or files into JavaScript objects using `fast-xml-parser`: @@ -72,14 +110,6 @@ const data = await parseXml('/path/to/file.xml'); const data = await parseXml(xmlString, {attributeNamePrefix: ''}); ``` -### Key Features - -- **HTML5-compliant serialization**: Void elements (`
`, ``, `
`) are self-closing, non-void elements (`'); + assert.equal(processed, '
'); }); it('Can convert WP post embed', async function () { diff --git a/packages/mg-wp-api/utils/users-html-to-json.js b/packages/mg-wp-api/utils/users-html-to-json.js index 1f7b2ed48..af8f405a6 100644 --- a/packages/mg-wp-api/utils/users-html-to-json.js +++ b/packages/mg-wp-api/utils/users-html-to-json.js @@ -9,7 +9,7 @@ import {dirname, join} from 'node:path'; import {domUtils} from '@tryghost/mg-utils'; import {slugify} from '@tryghost/string'; -const {parseFragment} = domUtils; +const {processFragment} = domUtils; if (!process.argv[2]) { console.error('Please provide a path to the file'); // eslint-disable-line no-console @@ -21,45 +21,47 @@ const destPath = join(desitnationDir, 'users.json'); const html = readFileSync(process.argv[2], 'utf8'); -const parsed = parseFragment(html); +const users = processFragment(html, (parsed) => { + let result = []; -let users = []; + for (const el of parsed.$('tr[id^="user-"]')) { + const postsCell = el.querySelector('[data-colname="Posts"]'); + const postCount = parseInt(postsCell ? postsCell.textContent.trim() : '0'); -for (const el of parsed.$('tr[id^="user-"]')) { - const postsCell = el.querySelector('[data-colname="Posts"]'); - const postCount = parseInt(postsCell ? postsCell.textContent.trim() : '0'); + if (postCount === 0) { + continue; + } - if (postCount === 0) { - continue; - } + let id = parseInt(el.getAttribute('id').replace('user-', '')); + const emailCell = el.querySelector('[data-colname="Email"]'); + let email = emailCell ? emailCell.textContent.trim() : ''; + const nameCell = el.querySelector('[data-colname="Name"]'); + let name = nameCell ? nameCell.textContent.trim() : ''; + const usernameCell = el.querySelector('[data-colname="Username"]'); + const usernameStrong = usernameCell ? usernameCell.querySelector('strong') : null; + let username = usernameStrong ? usernameStrong.textContent.trim() : ''; + const usernameImg = usernameCell ? usernameCell.querySelector('img') : null; + let image = usernameImg ? usernameImg.getAttribute('src').trim().replace('s=64', 's=500') : ''; - let id = parseInt(el.getAttribute('id').replace('user-', '')); - const emailCell = el.querySelector('[data-colname="Email"]'); - let email = emailCell ? emailCell.textContent.trim() : ''; - const nameCell = el.querySelector('[data-colname="Name"]'); - let name = nameCell ? nameCell.textContent.trim() : ''; - const usernameCell = el.querySelector('[data-colname="Username"]'); - const usernameStrong = usernameCell ? usernameCell.querySelector('strong') : null; - let username = usernameStrong ? usernameStrong.textContent.trim() : ''; - const usernameImg = usernameCell ? usernameCell.querySelector('img') : null; - let image = usernameImg ? usernameImg.getAttribute('src').trim().replace('s=64', 's=500') : ''; + if (name.includes('—Unknown')) { + name = username; + } - if (name.includes('—Unknown')) { - name = username; - } + let slug = slugify(name); - let slug = slugify(name); + result.push({ + id, + slug, + name, + email, + avatar_urls: { + 96: image + } + }); + } - users.push({ - id, - slug, - name, - email, - avatar_urls: { - 96: image - } - }); -} + return result; +}); writeFileSync(destPath, JSON.stringify(users, null, 4)); console.log(`✅ File saved to: ${destPath}`); // eslint-disable-line no-console diff --git a/packages/mg-wp-xml/lib/process.js b/packages/mg-wp-xml/lib/process.js index 1b8d77546..27f4c2aa4 100644 --- a/packages/mg-wp-xml/lib/process.js +++ b/packages/mg-wp-xml/lib/process.js @@ -19,7 +19,7 @@ const fixSerializedLengths = (value) => { }); }; -const {parseFragment} = domUtils; +const {processFragment} = domUtils; const {getYouTubeID} = youtubeUtils; // XML Parser configuration @@ -239,17 +239,16 @@ const preProcessContent = async ({html, options}) => { // eslint-disable-line no // Join the separated lines html = splitIt.join('\n'); - const parsed = parseFragment(html); - - // Remove empty link elements, typically HTML anchors - for (const el of parsed.$('a')) { - if (el.innerHTML.length === 0) { - el.remove(); + html = processFragment(html, (parsed) => { + // Remove empty link elements, typically HTML anchors + for (const el of parsed.$('a')) { + if (el.innerHTML.length === 0) { + el.remove(); + } } - } - // convert HTML back to a string - html = parsed.html(); + return parsed.html(); + }); // Convert shortcodes here to that they don't accidently get wrapped in

tags by MarkdownIt html = await MgWpAPI.process.processShortcodes({html, options}); @@ -382,7 +381,7 @@ const processPost = async (post, users, options, fileCache) => { // Check for audio enclosure in post metadata and prepend audio card const metaData = await processWPMeta(post); const audioCardHTML = processEnclosureAudio(metaData); - const hasLibsynEmbed = parseFragment(postObj.data.html).$('iframe[src*="libsyn.com"]').length > 0; + const hasLibsynEmbed = processFragment(postObj.data.html, p => p.$('iframe[src*="libsyn.com"]').length > 0); if (audioCardHTML && !hasLibsynEmbed) { postObj.data.html = `${audioCardHTML}${postObj.data.html}`; } diff --git a/yarn.lock b/yarn.lock index 69b51c7aa..8c5afe098 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4631,16 +4631,6 @@ resolved "https://registry.yarnpkg.com/@types/istanbul-lib-coverage/-/istanbul-lib-coverage-2.0.6.tgz#7739c232a1fee9b4d3ce8985f314c0c6d33549d7" integrity sha512-2QF/t/auWm0lsy8XtKVPG19v3sSOQlJe/YHZgfjb/KBBHOGSV+J2q/S671rcq9uTBrLAXmZpqJiaQbMT+zNU1w== -"@types/jsdom@28.0.1": - version "28.0.1" - resolved "https://registry.yarnpkg.com/@types/jsdom/-/jsdom-28.0.1.tgz#2c014d8c0eca6135233519bff8c49f7aadfeda63" - integrity sha512-GJq2QE4TAZ5ajSoCasn5DOFm8u1mI3tIFvM5tIq3W5U/RTB6gsHwc6Yhpl91X9VSDOUVblgXmG+2+sSvFQrdlw== - dependencies: - "@types/node" "*" - "@types/tough-cookie" "*" - parse5 "^7.0.0" - undici-types "^7.21.0" - "@types/keyv@^3.1.4": version "3.1.4" resolved "https://registry.yarnpkg.com/@types/keyv/-/keyv-3.1.4.tgz#3ccdb1c6751b0c7e52300bcdacd5bcbf8faa75b6" @@ -4698,11 +4688,6 @@ dependencies: "@types/node" "*" -"@types/tough-cookie@*": - version "4.0.5" - resolved "https://registry.yarnpkg.com/@types/tough-cookie/-/tough-cookie-4.0.5.tgz#cb6e2a691b70cb177c6e3ae9c1d2e8b2ea8cd304" - integrity sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA== - "@types/unist@^2.0.0", "@types/unist@^2.0.2": version "2.0.11" resolved "https://registry.yarnpkg.com/@types/unist/-/unist-2.0.11.tgz#11af57b127e32487774841f7a4e54eab166d03c4" @@ -6239,6 +6224,11 @@ css-what@^6.1.0: resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.2.2.tgz#cdcc8f9b6977719fdfbd1de7aec24abf756b9dea" integrity sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA== +cssom@^0.5.0: + version "0.5.0" + resolved "https://registry.yarnpkg.com/cssom/-/cssom-0.5.0.tgz#d254fa92cd8b6fbd83811b9fbaed34663cc17c36" + integrity sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw== + csv-parse@5.4.0: version "5.4.0" resolved "https://registry.yarnpkg.com/csv-parse/-/csv-parse-5.4.0.tgz#6793210a4a49a9a74b3fde3f9d00f3f52044fd89" @@ -7850,6 +7840,11 @@ html-escaper@^2.0.0: resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== +html-escaper@^3.0.3: + version "3.0.3" + resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-3.0.3.tgz#4d336674652beb1dcbc29ef6b6ba7f6be6fdfed6" + integrity sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ== + html-minifier@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/html-minifier/-/html-minifier-4.0.0.tgz#cca9aad8bce1175e02e17a8c33e46d8988889f56" @@ -7879,7 +7874,7 @@ html-to-text@9.0.5: htmlparser2 "^8.0.2" selderee "^0.11.0" -htmlparser2@^10.1.0: +htmlparser2@^10.0.0, htmlparser2@^10.1.0: version "10.1.0" resolved "https://registry.yarnpkg.com/htmlparser2/-/htmlparser2-10.1.0.tgz#fe3f2e12c73b6e462d4e10395db9c1119e4d6ae4" integrity sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ== @@ -8453,7 +8448,7 @@ jsbn@~0.1.0: resolved "https://registry.yarnpkg.com/jsbn/-/jsbn-0.1.1.tgz#a5e654c2e5a2deb5f201d96cefbca80c0ef2f513" integrity sha512-UVU9dibq2JcFWxQPA6KCqj5O42VOmAY3zQUfEKxU0KpTGXwNoCjkX1e13eHNvw/xPynt6pU0rZ1htjWTNTSXsg== -jsdom@29.0.1, jsdom@^29.0.0: +jsdom@^29.0.0: version "29.0.1" resolved "https://registry.yarnpkg.com/jsdom/-/jsdom-29.0.1.tgz#b2db17191533dd5ba1e0d4c61fe9fa2289e87be9" integrity sha512-z6JOK5gRO7aMybVq/y/MlIpKh8JIi68FBKMUtKkK2KH/wMSRlCxQ682d08LB9fYXplyY/UXG8P4XXTScmdjApg== @@ -8669,6 +8664,17 @@ lines-and-columns@^1.1.6: resolved "https://registry.yarnpkg.com/lines-and-columns/-/lines-and-columns-1.2.4.tgz#eca284f75d2965079309dc0ad9255abb2ebc1632" integrity sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg== +linkedom@0.18.12: + version "0.18.12" + resolved "https://registry.yarnpkg.com/linkedom/-/linkedom-0.18.12.tgz#a8b1a1942b567dcb1888093df311055da1349a14" + integrity sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q== + dependencies: + css-select "^5.1.0" + cssom "^0.5.0" + html-escaper "^3.0.3" + htmlparser2 "^10.0.0" + uhyphen "^0.2.0" + linkify-it@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/linkify-it/-/linkify-it-5.0.0.tgz#9ef238bfa6dc70bd8e7f9572b52d369af569b421" @@ -11472,16 +11478,16 @@ uglify-js@^3.1.4, uglify-js@^3.5.1: resolved "https://registry.yarnpkg.com/uglify-js/-/uglify-js-3.19.3.tgz#82315e9bbc6f2b25888858acd1fff8441035b77f" integrity sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ== +uhyphen@^0.2.0: + version "0.2.0" + resolved "https://registry.yarnpkg.com/uhyphen/-/uhyphen-0.2.0.tgz#8fdf0623314486e020a3c00ee5cc7a12fe722b81" + integrity sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA== + uint8array-extras@^1.5.0: version "1.5.0" resolved "https://registry.yarnpkg.com/uint8array-extras/-/uint8array-extras-1.5.0.tgz#10d2a85213de3ada304fea1c454f635c73839e86" integrity sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A== -undici-types@^7.21.0: - version "7.24.6" - resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-7.24.6.tgz#61275b485d7fd4e9d269c7cf04ec2873c9cc0f91" - integrity sha512-WRNW+sJgj5OBN4/0JpHFqtqzhpbnV0GuB+OozA9gCL7a993SmU+1JBZCzLNxYsbMfIeDL+lTsphD5jN5N+n0zg== - undici-types@~7.16.0: version "7.16.0" resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-7.16.0.tgz#ffccdff36aea4884cbfce9a750a0580224f58a46"