hexo/node_modules/html-to-text/lib/html-to-text.mjs

2148 lines
65 KiB
JavaScript

import { hp2Builder } from '@selderee/plugin-htmlparser2';
import { parseDocument } from 'htmlparser2';
import { DecisionTree } from 'selderee';
import merge from 'deepmerge';
import { render } from 'dom-serializer';
/**
 * Build a recursive function whose recursion depth is capped.
 *
 * `f` receives a "recurse" callback as its first argument. Each time that
 * callback is invoked the remaining depth budget shrinks by one; once the
 * budget is exhausted the callback is `g` instead of another level of `f`.
 * With `n === undefined` there is no cap and `f` simply receives itself.
 *
 * @param { number | undefined } n Allowed depth of recursion. `undefined` for no limitation.
 * @param { Function } f Function that accepts recursive callback as the first argument.
 * @param { Function } [g] Function to run instead, when maximum depth was reached. Do nothing by default.
 * @returns { Function }
 */
function limitedDepthRecursive (n, f, g = () => undefined) {
  if (n === undefined) {
    // Unlimited: hand `f` a reference to its own wrapper.
    const unlimited = (...args) => f(unlimited, ...args);
    return unlimited;
  }
  if (!(n >= 0)) {
    // Budget exhausted (or not a comparable number): use the fallback.
    return g;
  }
  return (...args) => {
    // The next level is created lazily, only when this level is actually called.
    const nextLevel = limitedDepthRecursive(n - 1, f, g);
    return f(nextLevel, ...args);
  };
}
/**
 * Strip every leading and trailing occurrence of one character from a string.
 *
 * The original string is returned as is when there is nothing to strip.
 *
 * @param { string } str A string to trim.
 * @param { string } char A character to be trimmed.
 * @returns { string }
 */
function trimCharacter (str, char) {
  const length = str.length;
  let from = 0;
  while (from < length && str[from] === char) { from += 1; }
  let to = length;
  while (to > from && str[to - 1] === char) { to -= 1; }
  const nothingTrimmed = (from === 0 && to === length);
  return nothingTrimmed ? str : str.substring(from, to);
}
/**
 * Strip every trailing occurrence of one character from a string.
 *
 * The original string is returned as is when it does not end with `char`.
 *
 * @param { string } str A string to trim.
 * @param { string } char A character to be trimmed.
 * @returns { string }
 */
function trimCharacterEnd (str, char) {
  let keep = str.length;
  while (keep > 0) {
    if (str[keep - 1] !== char) { break; }
    keep -= 1;
  }
  if (keep === str.length) { return str; }
  return str.substring(0, keep);
}
/**
 * Return a new string with all characters replaced with unicode escape sequences.
 * This extreme kind of escaping can be used to safely compose regular expressions.
 *
 * Every UTF-16 code unit becomes a four-digit `\uXXXX` sequence.
 *
 * @param { string } str A string to escape.
 * @returns { string } A string of unicode escape sequences.
 */
function unicodeEscape (str) {
  const escapeCodeUnit = (unit) => {
    const hex = unit.charCodeAt(0).toString(16);
    return '\\u' + hex.padStart(4, '0');
  };
  return str.replace(/[\s\S]/g, escapeCodeUnit);
}
/**
 * Collapse array items that share a key into a single merged item.
 *
 * Items are visited from last to first. When a key is seen again, the earlier
 * item is deep-merged with what was accumulated for that key, with values from
 * the later items taking preference and arrays replaced rather than concatenated.
 * The merged item keeps the position of the last item with that key.
 *
 * @param { any[] } items An array to deduplicate.
 * @param { (x: any) => string } getKey Callback to get a value that distinguishes unique items.
 * @returns { any[] }
 */
function mergeDuplicatesPreferLast (items, getKey) {
  const mergedByKey = new Map();
  for (const item of [...items].reverse()) {
    const key = getKey(item);
    if (mergedByKey.has(key)) {
      const accumulated = mergedByKey.get(key);
      mergedByKey.set(key, merge(item, accumulated, { arrayMerge: overwriteMerge$1 }));
    } else {
      mergedByKey.set(key, item);
    }
  }
  // Keys were inserted in reverse order of their last occurrence - restore it.
  const ordered = [...mergedByKey.values()];
  ordered.reverse();
  return ordered;
}
/**
 * Array merge strategy: ignore the destination array, return a copy of the source array.
 */
const overwriteMerge$1 = (_destination, source, _mergeOptions) => [...source];
/**
 * Read a nested property by following a list of keys.
 *
 * Stops and yields `undefined` as soon as a falsy value is met
 * while there are still keys left to follow.
 *
 * @param { object } obj The object to query for the value.
 * @param { string[] } path The path to the property.
 * @returns { any }
 */
function get (obj, path) {
  let current = obj;
  for (const segment of path) {
    if (!current) {
      return undefined;
    }
    current = current[segment];
  }
  return current;
}
/**
 * Convert a number into alphabetic sequence representation (Sequence without zeroes).
 *
 * For example: `a, ..., z, aa, ..., zz, aaa, ...`.
 *
 * @param { number } num Number to convert. Must be >= 1.
 * @param { string } [baseChar = 'a'] Character for 1 in the sequence.
 * @param { number } [base = 26] Number of characters in the sequence.
 * @returns { string }
 */
function numberToLetterSequence (num, baseChar = 'a', base = 26) {
  const firstCode = baseChar.charCodeAt(0);
  let remaining = num;
  let letters = '';
  do {
    // There is no zero digit in this numbering, hence the shift by one.
    remaining -= 1;
    letters = String.fromCharCode(firstCode + (remaining % base)) + letters;
    remaining = (remaining / base) | 0; // truncate to integer
  } while (remaining > 0);
  return letters;
}
// Roman symbols for 1 and 5 of each decimal place (units, tens, hundreds, thousands).
const I = ['I', 'X', 'C', 'M'];
const V = ['V', 'L', 'D'];
/**
 * Convert a number to its Roman representation. No large numbers extension.
 *
 * Each decimal digit is translated independently using the "one" and "five"
 * symbols of its place and the "one" symbol of the next place.
 *
 * @param { number } num Number to convert. `0 < num <= 3999`.
 * @returns { string }
 */
function numberToRoman (num) {
  const digits = [...(num + '')].map(Number);
  const highestPlace = digits.length - 1;
  return digits
    .map((digit, position) => {
      const place = highestPlace - position; // 0 for units, 1 for tens, ...
      const remainder = digit % 5;
      if (remainder < 4) {
        // Additive form: optional "five" followed by 0-3 "ones".
        const five = (digit < 5) ? '' : V[place];
        return five + I[place].repeat(remainder);
      }
      // Subtractive form: "one" before "five" (4) or before the next "one" (9).
      const larger = (digit < 5) ? V[place] : I[place + 1];
      return I[place] + larger;
    })
    .join('');
}
/**
 * Helps to build text from words.
 *
 * Words are accumulated in the line currently being built (`nextLineWords`);
 * finished lines are kept in `lines`. A budget of remaining characters
 * (`nextLineAvailableChars`) decides when a new line has to be started.
 */
class InlineTextBuilder {
  /**
   * Creates an instance of InlineTextBuilder.
   *
   * If `maxLineLength` is not provided then it is either `options.wordwrap` or unlimited.
   *
   * @param { Options } options HtmlToText options.
   * @param { number } [ maxLineLength ] This builder will try to wrap text to fit this line length.
   */
  constructor (options, maxLineLength = undefined) {
    /** @type { string[][] } */
    this.lines = [];
    /** @type { string[] } */
    this.nextLineWords = [];
    this.maxLineLength = maxLineLength || options.wordwrap || Number.MAX_VALUE;
    this.nextLineAvailableChars = this.maxLineLength;
    this.wrapCharacters = get(options, ['longWordSplit', 'wrapCharacters']) || [];
    this.forceWrapOnLimit = get(options, ['longWordSplit', 'forceWrapOnLimit']) || false;
    this.stashedSpace = false;
    this.wordBreakOpportunity = false;
  }

  /**
   * Add a new word.
   *
   * @param { string } word A word to add.
   * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
   */
  pushWord (word, noWrap = false) {
    if (this.nextLineAvailableChars <= 0 && !noWrap) {
      this.startNewLine();
    }
    const isLineStart = this.nextLineWords.length === 0;
    // Every word except the first one on a line is preceded by a space.
    const cost = word.length + (isLineStart ? 0 : 1);
    if ((cost <= this.nextLineAvailableChars) || noWrap) { // Fits into available budget
      this.nextLineWords.push(word);
      this.nextLineAvailableChars -= cost;
    } else { // Does not fit - try to split the word
      // The word is moved to a new line - prefer to wrap between words.
      const [first, ...rest] = this.splitLongWord(word);
      if (!isLineStart) { this.startNewLine(); }
      this.nextLineWords.push(first);
      this.nextLineAvailableChars -= first.length;
      for (const part of rest) {
        this.startNewLine();
        this.nextLineWords.push(part);
        this.nextLineAvailableChars -= part.length;
      }
    }
  }

  /**
   * Pop a word from the currently built line.
   * This doesn't affect completed lines.
   *
   * @returns { string } The removed word, or `undefined` if the current line has no words.
   */
  popWord () {
    const lastWord = this.nextLineWords.pop();
    if (lastWord !== undefined) {
      // Give back the budget the word (and its separating space) has taken.
      const isLineStart = this.nextLineWords.length === 0;
      const cost = lastWord.length + (isLineStart ? 0 : 1);
      this.nextLineAvailableChars += cost;
    }
    return lastWord;
  }

  /**
   * Concat a word to the last word already in the builder.
   * Adds a new word in case there are no words yet in the last line.
   *
   * If a word break opportunity was registered and the word doesn't fit
   * into the remaining budget, the word is pushed as a separate word instead.
   *
   * @param { string } word A word to be concatenated.
   * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
   */
  concatWord (word, noWrap = false) {
    if (this.wordBreakOpportunity && word.length > this.nextLineAvailableChars) {
      this.pushWord(word, noWrap);
      this.wordBreakOpportunity = false;
    } else {
      const lastWord = this.popWord();
      this.pushWord((lastWord) ? lastWord.concat(word) : word, noWrap);
    }
  }

  /**
   * Add current line (and more empty lines if provided argument > 1) to the list of complete lines and start a new one.
   *
   * @param { number } n Number of line breaks that will be added to the resulting string.
   */
  startNewLine (n = 1) {
    this.lines.push(this.nextLineWords);
    if (n > 1) {
      this.lines.push(...Array.from({ length: n - 1 }, () => []));
    }
    this.nextLineWords = [];
    this.nextLineAvailableChars = this.maxLineLength;
  }

  /**
   * No words in this builder.
   *
   * @returns { boolean }
   */
  isEmpty () {
    return this.lines.length === 0
      && this.nextLineWords.length === 0;
  }

  /**
   * Drop all completed lines and the words of the current line,
   * restore the full character budget for the current line.
   */
  clear () {
    this.lines.length = 0;
    this.nextLineWords.length = 0;
    this.nextLineAvailableChars = this.maxLineLength;
  }

  /**
   * Join all lines of words inside the InlineTextBuilder into a complete string.
   *
   * @returns { string }
   */
  toString () {
    return [...this.lines, this.nextLineWords]
      .map(words => words.join(' '))
      .join('\n');
  }

  /**
   * Split a long word up to fit within the word wrap limit.
   * Use either a character to split looking back from the word wrap limit,
   * or truncate to the word wrap limit.
   *
   * @param { string } word Input word.
   * @returns { string[] } Parts of the word.
   */
  splitLongWord (word) {
    const parts = [];
    let idx = 0;
    while (word.length > this.maxLineLength) {
      const firstLine = word.substring(0, this.maxLineLength);
      const remainingChars = word.substring(this.maxLineLength);
      // Only search while there is a wrap character left to try. An out-of-range
      // lookup yields `undefined`, which `lastIndexOf` would coerce to the string
      // "undefined" and match inside words that happen to contain it.
      const splitIndex = (idx < this.wrapCharacters.length)
        ? firstLine.lastIndexOf(this.wrapCharacters[idx])
        : -1;
      if (splitIndex > -1) { // Found a character to split on
        word = firstLine.substring(splitIndex + 1) + remainingChars;
        parts.push(firstLine.substring(0, splitIndex + 1));
      } else { // Not found a character to split on
        idx++;
        if (idx < this.wrapCharacters.length) { // There is next character to try
          word = firstLine + remainingChars;
        } else { // No more characters to try
          if (this.forceWrapOnLimit) {
            parts.push(firstLine);
            word = remainingChars;
            if (word.length > this.maxLineLength) {
              continue;
            }
          } else {
            word = firstLine + remainingChars;
          }
          break;
        }
      }
    }
    parts.push(word); // Add remaining part to array
    return parts;
  }
}
/* eslint-disable max-classes-per-file */
/**
 * Base item of a singly linked stack: each item refers to the item below it via `next`.
 */
class StackItem {
  /**
   * @param { StackItem | null } [next] The item below this one in the stack, `null` for the bottom item.
   */
  constructor (next = null) { this.next = next; }

  /**
   * Follow the `next` links down to the bottom (root) item of the stack.
   *
   * @returns { StackItem } The root item; this item itself when it has no `next`.
   */
  getRoot () {
    let item = this;
    // Walk the whole chain: returning only the immediate `next`
    // would yield an intermediate item for stacks deeper than two.
    while (item.next) { item = item.next; }
    return item;
  }
}
/**
 * Stack item that accumulates the text of a block.
 *
 * Holds an inline text builder for the text being added word by word,
 * `rawText` for already finalized text and the counters of line breaks
 * around the block. `isPre` and `isNoWrap` are taken from `next` when it is given.
 */
class BlockStackItem extends StackItem {
  constructor (options, next = null, leadingLineBreaks = 1, maxLineLength = undefined) {
    super(next);
    Object.assign(this, {
      leadingLineBreaks,
      inlineTextBuilder: new InlineTextBuilder(options, maxLineLength),
      rawText: '',
      stashedLineBreaks: 0,
      isPre: next ? next.isPre : next,
      isNoWrap: next ? next.isNoWrap : next,
    });
  }
}
/**
 * Stack item for a list: a block that additionally remembers
 * how item prefixes are laid out and how list items are separated.
 */
class ListStackItem extends BlockStackItem {
  constructor (
    options,
    next = null,
    {
      interRowLineBreaks = 1,
      leadingLineBreaks = 2,
      maxLineLength = undefined,
      maxPrefixLength = 0,
      prefixAlign = 'left',
    } = {}
  ) {
    super(options, next, leadingLineBreaks, maxLineLength);
    Object.assign(this, { maxPrefixLength, prefixAlign, interRowLineBreaks });
  }
}
/**
 * Stack item for a single list item: a block that also carries the item prefix.
 */
class ListItemStackItem extends BlockStackItem {
  constructor (
    options,
    next = null,
    {
      leadingLineBreaks = 1,
      maxLineLength = undefined,
      prefix = '',
    } = {}
  ) {
    super(options, next, leadingLineBreaks, maxLineLength);
    Object.assign(this, { prefix });
  }
}
/**
 * Stack item for a table: collects rows.
 * `isPre` and `isNoWrap` are taken from `next` when it is given.
 */
class TableStackItem extends StackItem {
  constructor (next = null) {
    super(next);
    Object.assign(this, {
      rows: [],
      isPre: next ? next.isPre : next,
      isNoWrap: next ? next.isNoWrap : next,
    });
  }
}
/**
 * Stack item for a table row: collects cells.
 * `isPre` and `isNoWrap` are taken from `next` when it is given.
 */
class TableRowStackItem extends StackItem {
  constructor (next = null) {
    super(next);
    Object.assign(this, {
      cells: [],
      isPre: next ? next.isPre : next,
      isNoWrap: next ? next.isNoWrap : next,
    });
  }
}
/**
 * Stack item for a table cell: accumulates text like a block does,
 * with its inline text builder limited to `maxColumnWidth` when provided.
 * `isPre` and `isNoWrap` are taken from `next` when it is given.
 */
class TableCellStackItem extends StackItem {
  constructor (options, next = null, maxColumnWidth = undefined) {
    super(next);
    Object.assign(this, {
      inlineTextBuilder: new InlineTextBuilder(options, maxColumnWidth),
      rawText: '',
      stashedLineBreaks: 0,
      isPre: next ? next.isPre : next,
      isNoWrap: next ? next.isNoWrap : next,
    });
  }
}
/**
 * Stack item that carries a transform function; `next` links to the item below it.
 */
class TransformerStackItem extends StackItem {
  constructor (next = null, transform) {
    super(next);
    Object.assign(this, { transform });
  }
}
/**
 * Turn each character of a string into a `\uXXXX` escape sequence
 * (four hex digits of the character's first UTF-16 code unit).
 *
 * @param { string } str Characters to convert.
 * @returns { string } Concatenated escape sequences.
 */
function charactersToCodes (str) {
  let codes = '';
  for (const character of str) {
    const hex = character.charCodeAt(0).toString(16);
    codes += '\\u' + hex.padStart(4, '0');
  }
  return codes;
}
/**
 * Helps to handle HTML whitespaces.
 *
 * All regular expressions are built once from `options.whitespaceCharacters`.
 * Some of them are global (stateful) and shared between calls, so every scan
 * resets `lastIndex` first - otherwise an exception thrown in the middle of
 * a scan would make the next call start somewhere inside of its text.
 *
 * @class WhitespaceProcessor
 */
class WhitespaceProcessor {
  /**
   * Creates an instance of WhitespaceProcessor.
   *
   * @param { Options } options HtmlToText options.
   * @memberof WhitespaceProcessor
   */
  constructor (options) {
    // With preserved newlines `\n` is not treated as a whitespace character.
    this.whitespaceChars = (options.preserveNewlines)
      ? options.whitespaceCharacters.replace(/\n/g, '')
      : options.whitespaceCharacters;
    const whitespaceCodes = charactersToCodes(this.whitespaceChars);
    this.leadingWhitespaceRe = new RegExp(`^[${whitespaceCodes}]`);
    this.trailingWhitespaceRe = new RegExp(`[${whitespaceCodes}]$`);
    this.allWhitespaceOrEmptyRe = new RegExp(`^[${whitespaceCodes}]*$`);
    this.newlineOrNonWhitespaceRe = new RegExp(`(\\n|[^\\n${whitespaceCodes}])`, 'g');
    this.newlineOrNonNewlineStringRe = new RegExp(`(\\n|[^\\n]+)`, 'g');
    if (options.preserveNewlines) {
      const wordOrNewlineRe = new RegExp(`\\n|[^\\n${whitespaceCodes}]+`, 'gm');
      /**
       * Shrink whitespaces and wrap text, add to the builder.
       *
       * @param { string } text Input text.
       * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text.
       * @param { (str: string) => string } [ transform ] A transform to be applied to words.
       * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
       */
      this.shrinkWrapAdd = function (text, inlineTextBuilder, transform = (str => str), noWrap = false) {
        if (!text) { return; }
        const previouslyStashedSpace = inlineTextBuilder.stashedSpace;
        let anyMatch = false;
        // The regex is shared between calls: always scan from the start of the text.
        wordOrNewlineRe.lastIndex = 0;
        let m = wordOrNewlineRe.exec(text);
        if (m) {
          anyMatch = true;
          if (m[0] === '\n') {
            inlineTextBuilder.startNewLine();
          } else if (previouslyStashedSpace || this.testLeadingWhitespace(text)) {
            inlineTextBuilder.pushWord(transform(m[0]), noWrap);
          } else {
            // No whitespace before the first word - it continues the previous word.
            inlineTextBuilder.concatWord(transform(m[0]), noWrap);
          }
          while ((m = wordOrNewlineRe.exec(text)) !== null) {
            if (m[0] === '\n') {
              inlineTextBuilder.startNewLine();
            } else {
              inlineTextBuilder.pushWord(transform(m[0]), noWrap);
            }
          }
        }
        inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch) || (this.testTrailingWhitespace(text));
        // No need to stash a space in case last added item was a new line,
        // but that won't affect anything later anyway.
      };
    } else {
      const wordRe = new RegExp(`[^${whitespaceCodes}]+`, 'g');
      /**
       * Shrink whitespaces and wrap text, add to the builder.
       *
       * @param { string } text Input text.
       * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text.
       * @param { (str: string) => string } [ transform ] A transform to be applied to words.
       * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
       */
      this.shrinkWrapAdd = function (text, inlineTextBuilder, transform = (str => str), noWrap = false) {
        if (!text) { return; }
        const previouslyStashedSpace = inlineTextBuilder.stashedSpace;
        let anyMatch = false;
        // The regex is shared between calls: always scan from the start of the text.
        wordRe.lastIndex = 0;
        let m = wordRe.exec(text);
        if (m) {
          anyMatch = true;
          if (previouslyStashedSpace || this.testLeadingWhitespace(text)) {
            inlineTextBuilder.pushWord(transform(m[0]), noWrap);
          } else {
            // No whitespace before the first word - it continues the previous word.
            inlineTextBuilder.concatWord(transform(m[0]), noWrap);
          }
          while ((m = wordRe.exec(text)) !== null) {
            inlineTextBuilder.pushWord(transform(m[0]), noWrap);
          }
        }
        inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch) || this.testTrailingWhitespace(text);
      };
    }
  }

  /**
   * Add text with only minimal processing.
   * Everything between newlines considered a single word.
   * No whitespace is trimmed.
   * Not affected by preserveNewlines option - `\n` always starts a new line.
   *
   * `noWrap` argument is `true` by default - this won't start a new line
   * even if there is not enough space left in the current line.
   *
   * @param { string } text Input text.
   * @param { InlineTextBuilder } inlineTextBuilder A builder to receive processed text.
   * @param { boolean } [noWrap] Don't wrap text even if the line is too long.
   */
  addLiteral (text, inlineTextBuilder, noWrap = true) {
    if (!text) { return; }
    const previouslyStashedSpace = inlineTextBuilder.stashedSpace;
    let anyMatch = false;
    // The regex is shared between calls: always scan from the start of the text.
    this.newlineOrNonNewlineStringRe.lastIndex = 0;
    let m = this.newlineOrNonNewlineStringRe.exec(text);
    if (m) {
      anyMatch = true;
      if (m[0] === '\n') {
        inlineTextBuilder.startNewLine();
      } else if (previouslyStashedSpace) {
        inlineTextBuilder.pushWord(m[0], noWrap);
      } else {
        inlineTextBuilder.concatWord(m[0], noWrap);
      }
      while ((m = this.newlineOrNonNewlineStringRe.exec(text)) !== null) {
        if (m[0] === '\n') {
          inlineTextBuilder.startNewLine();
        } else {
          inlineTextBuilder.pushWord(m[0], noWrap);
        }
      }
    }
    inlineTextBuilder.stashedSpace = (previouslyStashedSpace && !anyMatch);
  }

  /**
   * Test whether the given text starts with HTML whitespace character.
   *
   * @param { string } text The string to test.
   * @returns { boolean }
   */
  testLeadingWhitespace (text) {
    return this.leadingWhitespaceRe.test(text);
  }

  /**
   * Test whether the given text ends with HTML whitespace character.
   *
   * @param { string } text The string to test.
   * @returns { boolean }
   */
  testTrailingWhitespace (text) {
    return this.trailingWhitespaceRe.test(text);
  }

  /**
   * Test whether the given text contains any non-whitespace characters.
   *
   * @param { string } text The string to test.
   * @returns { boolean }
   */
  testContainsWords (text) {
    return !this.allWhitespaceOrEmptyRe.test(text);
  }

  /**
   * Return the number of newlines if there are no words.
   *
   * If any word is found then return zero regardless of the actual number of newlines.
   *
   * @param { string } text Input string.
   * @returns { number }
   */
  countNewlinesNoWords (text) {
    this.newlineOrNonWhitespaceRe.lastIndex = 0;
    let counter = 0;
    let match;
    while ((match = this.newlineOrNonWhitespaceRe.exec(text)) !== null) {
      if (match[0] === '\n') {
        counter++;
      } else {
        return 0;
      }
    }
    return counter;
  }
}
/**
* Helps to build text from inline and block elements.
*
* Keeps a stack of items (blocks, lists, list items, tables, rows, cells).
* `open*` methods push a new item onto the stack, `close*` methods pop it
* and hand its content over to the item below.
*
* @class BlockTextBuilder
*/
class BlockTextBuilder {
/**
* Creates an instance of BlockTextBuilder.
*
* @param { Options } options HtmlToText options.
* @param { import('selderee').Picker<DomNode, TagDefinition> } picker Selectors decision tree picker.
* @param { any} [metadata] Optional metadata for HTML document, for use in formatters.
*/
constructor (options, picker, metadata = undefined) {
this.options = options;
this.picker = picker;
this.metadata = metadata;
this.whitespaceProcessor = new WhitespaceProcessor(options);
// The stack starts with a single block item created without a `next` item.
/** @type { StackItem } */
this._stackItem = new BlockStackItem(options);
/** @type { TransformerStackItem } */
this._wordTransformer = undefined;
}
/**
* Put a word-by-word transform function onto the transformations stack.
*
* Mainly used for uppercasing. Can be bypassed to add unformatted text such as URLs.
*
* Word transformations applied before wrapping.
*
* @param { (str: string) => string } wordTransform Word transformation function.
*/
pushWordTransform (wordTransform) {
this._wordTransformer = new TransformerStackItem(this._wordTransformer, wordTransform);
}
/**
* Remove a function from the word transformations stack.
*
* @returns { (str: string) => string } A function that was removed, `undefined` if the stack was empty.
*/
popWordTransform () {
if (!this._wordTransformer) { return undefined; }
const transform = this._wordTransformer.transform;
this._wordTransformer = this._wordTransformer.next;
return transform;
}
/**
* Ignore wordwrap option in followup inline additions and disable automatic wrapping.
*/
startNoWrap () {
this._stackItem.isNoWrap = true;
}
/**
* Return automatic wrapping to behavior defined by options.
*/
stopNoWrap () {
this._stackItem.isNoWrap = false;
}
/**
* Combine the word transformations stack with `options.encodeCharacters`.
*
* When both are present, words go through the transformations first and
* the result is passed to `encodeCharacters`. When there are no word
* transformations, `options.encodeCharacters` is returned as is.
*
* @returns { (str: string) => string }
*/
_getCombinedWordTransformer () {
const wt = (this._wordTransformer)
? ((str) => applyTransformer(str, this._wordTransformer))
: undefined;
const ce = this.options.encodeCharacters;
return (wt)
? ((ce) ? (str) => ce(wt(str)) : wt)
: ce;
}
/**
* Remove the current item from the stack, its `next` item becomes the current one.
*
* @returns { StackItem } The removed item.
*/
_popStackItem () {
const item = this._stackItem;
this._stackItem = item.next;
return item;
}
/**
* Add a line break into currently built block.
*
* Does nothing when the current stack item is not a block, a list item or a table cell.
*/
addLineBreak () {
if (!(
this._stackItem instanceof BlockStackItem
|| this._stackItem instanceof ListItemStackItem
|| this._stackItem instanceof TableCellStackItem
)) { return; }
if (this._stackItem.isPre) {
// Preformatted content is accumulated as raw text.
this._stackItem.rawText += '\n';
} else {
this._stackItem.inlineTextBuilder.startNewLine();
}
}
/**
* Allow to break line in case directly following text will not fit.
*
* Does nothing when the current stack item is not a block, a list item or a table cell.
*/
addWordBreakOpportunity () {
if (
this._stackItem instanceof BlockStackItem
|| this._stackItem instanceof ListItemStackItem
|| this._stackItem instanceof TableCellStackItem
) {
this._stackItem.inlineTextBuilder.wordBreakOpportunity = true;
}
}
/**
* Add a node inline into the currently built block.
*
* Does nothing when the current stack item is not a block, a list item or a table cell.
*
* @param { string } str
* Text content of a node to add.
*
* @param { object } [param1]
* Object holding the parameters of the operation.
*
* @param { boolean } [param1.noWordTransform]
* Ignore word transformers if there are any.
* Don't encode characters as well.
* (Use this for things like URL addresses).
*/
addInline (str, { noWordTransform = false } = {}) {
if (!(
this._stackItem instanceof BlockStackItem
|| this._stackItem instanceof ListItemStackItem
|| this._stackItem instanceof TableCellStackItem
)) { return; }
if (this._stackItem.isPre) {
// Preformatted content is appended as is, without any whitespace processing.
this._stackItem.rawText += str;
return;
}
if (
str.length === 0 || // empty string
(
this._stackItem.stashedLineBreaks && // stashed linebreaks make whitespace irrelevant
!this.whitespaceProcessor.testContainsWords(str) // no words to add
)
) { return; }
if (this.options.preserveNewlines) {
const newlinesNumber = this.whitespaceProcessor.countNewlinesNoWords(str);
if (newlinesNumber > 0) {
this._stackItem.inlineTextBuilder.startNewLine(newlinesNumber);
// keep stashedLineBreaks unchanged
return;
}
}
// Line breaks stashed after the previous block are emitted before the new text.
if (this._stackItem.stashedLineBreaks) {
this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks);
}
this.whitespaceProcessor.shrinkWrapAdd(
str,
this._stackItem.inlineTextBuilder,
(noWordTransform) ? undefined : this._getCombinedWordTransformer(),
this._stackItem.isNoWrap
);
this._stackItem.stashedLineBreaks = 0; // inline text doesn't introduce line breaks
}
/**
* Add a string inline into the currently built block.
*
* Use this for markup elements that don't have to adhere
* to text layout rules.
*
* Does nothing when the current stack item is not a block, a list item or a table cell.
*
* @param { string } str Text to add.
*/
addLiteral (str) {
if (!(
this._stackItem instanceof BlockStackItem
|| this._stackItem instanceof ListItemStackItem
|| this._stackItem instanceof TableCellStackItem
)) { return; }
if (str.length === 0) { return; }
if (this._stackItem.isPre) {
this._stackItem.rawText += str;
return;
}
// Line breaks stashed after the previous block are emitted before the new text.
if (this._stackItem.stashedLineBreaks) {
this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks);
}
this.whitespaceProcessor.addLiteral(
str,
this._stackItem.inlineTextBuilder,
this._stackItem.isNoWrap
);
this._stackItem.stashedLineBreaks = 0;
}
/**
* Start building a new block.
*
* @param { object } [param0]
* Object holding the parameters of the block.
*
* @param { number } [param0.leadingLineBreaks]
* This block should have at least this number of line breaks to separate it from any preceding block.
*
* @param { number } [param0.reservedLineLength]
* Reserve this number of characters on each line for block markup.
*
* @param { boolean } [param0.isPre]
* Should HTML whitespace be preserved inside this block.
*/
openBlock ({ leadingLineBreaks = 1, reservedLineLength = 0, isPre = false } = {}) {
// The line length of a nested block is never reduced below 20 characters.
const maxLineLength = Math.max(20, this._stackItem.inlineTextBuilder.maxLineLength - reservedLineLength);
this._stackItem = new BlockStackItem(
this.options,
this._stackItem,
leadingLineBreaks,
maxLineLength
);
// `isPre` is inherited from the parent item by default, here it can only be switched on.
if (isPre) { this._stackItem.isPre = true; }
}
/**
* Finalize currently built block, add its content to the parent block.
*
* @param { object } [param0]
* Object holding the parameters of the block.
*
* @param { number } [param0.trailingLineBreaks]
* This block should have at least this number of line breaks to separate it from any following block.
*
* @param { (str: string) => string } [param0.blockTransform]
* A function to transform the block text before adding to the parent block.
* This happens after word wrap and should be used in combination with reserved line length
* in order to keep line lengths correct.
* Used for whole block markup.
*/
closeBlock ({ trailingLineBreaks = 1, blockTransform = undefined } = {}) {
const block = this._popStackItem();
const blockText = (blockTransform) ? blockTransform(getText(block)) : getText(block);
addText(this._stackItem, blockText, block.leadingLineBreaks, Math.max(block.stashedLineBreaks, trailingLineBreaks));
}
/**
* Start building a new list.
*
* @param { object } [param0]
* Object holding the parameters of the list.
*
* @param { number } [param0.maxPrefixLength]
* Length of the longest list item prefix.
* If not supplied or too small then list items won't be aligned properly.
*
* @param { 'left' | 'right' } [param0.prefixAlign]
* Specify how prefixes of different lengths have to be aligned
* within a column.
*
* @param { number } [param0.interRowLineBreaks]
* Minimum number of line breaks between list items.
*
* @param { number } [param0.leadingLineBreaks]
* This list should have at least this number of line breaks to separate it from any preceding block.
*/
openList ({ maxPrefixLength = 0, prefixAlign = 'left', interRowLineBreaks = 1, leadingLineBreaks = 2 } = {}) {
this._stackItem = new ListStackItem(this.options, this._stackItem, {
interRowLineBreaks: interRowLineBreaks,
leadingLineBreaks: leadingLineBreaks,
maxLineLength: this._stackItem.inlineTextBuilder.maxLineLength,
maxPrefixLength: maxPrefixLength,
prefixAlign: prefixAlign
});
}
/**
* Start building a new list item.
*
* @param {object} param0
* Object holding the parameters of the list item.
*
* @param { string } [param0.prefix]
* Prefix for this list item (item number, bullet point, etc).
*
* @throws { Error } When the current stack item is not a list.
*/
openListItem ({ prefix = '' } = {}) {
if (!(this._stackItem instanceof ListStackItem)) {
throw new Error('Can\'t add a list item to something that is not a list! Check the formatter.');
}
const list = this._stackItem;
// Leave room for the prefix column, but never go below 20 characters per line.
const prefixLength = Math.max(prefix.length, list.maxPrefixLength);
const maxLineLength = Math.max(20, list.inlineTextBuilder.maxLineLength - prefixLength);
this._stackItem = new ListItemStackItem(this.options, list, {
prefix: prefix,
maxLineLength: maxLineLength,
leadingLineBreaks: list.interRowLineBreaks
});
}
/**
* Finalize currently built list item, add its content to the parent list.
*/
closeListItem () {
const listItem = this._popStackItem();
const list = listItem.next;
const prefixLength = Math.max(listItem.prefix.length, list.maxPrefixLength);
// Continuation lines of the item are indented by the width of the prefix column.
const spacing = '\n' + ' '.repeat(prefixLength);
const prefix = (list.prefixAlign === 'right')
? listItem.prefix.padStart(prefixLength)
: listItem.prefix.padEnd(prefixLength);
const text = prefix + getText(listItem).replace(/\n/g, spacing);
addText(
list,
text,
listItem.leadingLineBreaks,
Math.max(listItem.stashedLineBreaks, list.interRowLineBreaks)
);
}
/**
* Finalize currently built list, add its content to the parent block.
*
* Nothing is added to the parent when the list text is empty.
*
* @param { object } param0
* Object holding the parameters of the list.
*
* @param { number } [param0.trailingLineBreaks]
* This list should have at least this number of line breaks to separate it from any following block.
*/
closeList ({ trailingLineBreaks = 2 } = {}) {
const list = this._popStackItem();
const text = getText(list);
if (text) {
addText(this._stackItem, text, list.leadingLineBreaks, trailingLineBreaks);
}
}
/**
* Start building a table.
*/
openTable () {
this._stackItem = new TableStackItem(this._stackItem);
}
/**
* Start building a table row.
*
* @throws { Error } When the current stack item is not a table.
*/
openTableRow () {
if (!(this._stackItem instanceof TableStackItem)) {
throw new Error('Can\'t add a table row to something that is not a table! Check the formatter.');
}
this._stackItem = new TableRowStackItem(this._stackItem);
}
/**
* Start building a table cell.
*
* @param { object } [param0]
* Object holding the parameters of the cell.
*
* @param { number } [param0.maxColumnWidth]
* Wrap cell content to this width. Fall back to global wordwrap value if undefined.
*
* @throws { Error } When the current stack item is not a table row.
*/
openTableCell ({ maxColumnWidth = undefined } = {}) {
if (!(this._stackItem instanceof TableRowStackItem)) {
throw new Error('Can\'t add a table cell to something that is not a table row! Check the formatter.');
}
this._stackItem = new TableCellStackItem(this.options, this._stackItem, maxColumnWidth);
}
/**
* Finalize currently built table cell and add it to parent table row's cells.
*
* Newlines at both ends of the cell text are trimmed.
*
* @param { object } [param0]
* Object holding the parameters of the cell.
*
* @param { number } [param0.colspan] How many columns this cell should occupy.
* @param { number } [param0.rowspan] How many rows this cell should occupy.
*/
closeTableCell ({ colspan = 1, rowspan = 1 } = {}) {
const cell = this._popStackItem();
const text = trimCharacter(getText(cell), '\n');
cell.next.cells.push({ colspan: colspan, rowspan: rowspan, text: text });
}
/**
* Finalize currently built table row and add it to parent table's rows.
*/
closeTableRow () {
const row = this._popStackItem();
row.next.rows.push(row.cells);
}
/**
* Finalize currently built table and add the rendered text to the parent block.
*
* Nothing is added to the parent when `tableToString` produces an empty result.
*
* @param { object } param0
* Object holding the parameters of the table.
*
* @param { TablePrinter } param0.tableToString
* A function to convert a table of stringified cells into a complete table.
*
* @param { number } [param0.leadingLineBreaks]
* This table should have at least this number of line breaks to separate it from any preceding block.
*
* @param { number } [param0.trailingLineBreaks]
* This table should have at least this number of line breaks to separate it from any following block.
*/
closeTable ({ tableToString, leadingLineBreaks = 2, trailingLineBreaks = 2 }) {
const table = this._popStackItem();
const output = tableToString(table.rows);
if (output) {
addText(this._stackItem, output, leadingLineBreaks, trailingLineBreaks);
}
}
/**
* Return the rendered text content of this builder.
*
* @returns { string }
*/
toString () {
return getText(this._stackItem.getRoot());
// There should only be the root item if everything is closed properly.
}
}
/**
 * Get the text accumulated by a stack item: its raw text
 * followed by the content of its inline text builder.
 *
 * @param { StackItem } stackItem A block, a list item or a table cell.
 * @returns { string }
 * @throws { Error } When the stack item is of any other kind.
 */
function getText (stackItem) {
  const holdsText = stackItem instanceof BlockStackItem
    || stackItem instanceof ListItemStackItem
    || stackItem instanceof TableCellStackItem;
  if (!holdsText) {
    throw new Error('Only blocks, list items and table cells can be requested for text contents.');
  }
  const builder = stackItem.inlineTextBuilder;
  if (builder.isEmpty()) {
    return stackItem.rawText;
  }
  return stackItem.rawText + builder.toString();
}
/**
 * Append finalized text to a stack item.
 *
 * Whatever the item has accumulated so far is folded into its raw text and
 * separated from the new text by line breaks - the larger of the item's
 * stashed line breaks and `leadingLineBreaks`. If the item was empty, the new
 * text becomes its raw text and that number becomes the item's own leading
 * line breaks. `trailingLineBreaks` are stashed for whatever comes next.
 *
 * @param { StackItem } stackItem A block, a list item or a table cell to receive the text.
 * @param { string } text Text to add.
 * @param { number } leadingLineBreaks Minimum number of line breaks before the text.
 * @param { number } trailingLineBreaks Number of line breaks to stash after the text.
 * @throws { Error } When the stack item is of any other kind.
 */
function addText (stackItem, text, leadingLineBreaks, trailingLineBreaks) {
  const canHoldText = stackItem instanceof BlockStackItem
    || stackItem instanceof ListItemStackItem
    || stackItem instanceof TableCellStackItem;
  if (!canHoldText) {
    throw new Error('Only blocks, list items and table cells can contain text.');
  }
  const existingText = getText(stackItem);
  const separatorLength = Math.max(stackItem.stashedLineBreaks, leadingLineBreaks);
  stackItem.inlineTextBuilder.clear();
  if (!existingText) {
    stackItem.rawText = text;
    stackItem.leadingLineBreaks = separatorLength;
  } else {
    stackItem.rawText = existingText + '\n'.repeat(separatorLength) + text;
  }
  stackItem.stashedLineBreaks = trailingLineBreaks;
}
/**
 * Run a string through a chain of transformer items,
 * starting with the given item and following `next` links until the chain ends.
 *
 * @param { string } str A string to transform.
 * @param { TransformerStackItem } transformer A transformer item (with possible continuation).
 * @returns { string }
 */
function applyTransformer (str, transformer) {
  let result = str;
  for (let item = transformer; item; item = item.next) {
    result = item.transform(result);
  }
  return result;
}
/**
 * Compile selectors into a decision tree,
 * return a function intended for batch processing.
 *
 * Note: when `options.encodeCharacters` is not a function, it is replaced
 * on the given options object with a replacer function built from it.
 *
 * @param { Options } [options = {}] HtmlToText options (defaults, formatters, user options merged, deduplicated).
 * @returns { (html: string, metadata?: any) => string } Pre-configured converter function.
 * @throws { Error } When some of the selectors have no format specified.
 * @static
 */
function compile$1 (options = {}) {
  const formatlessSelectors = options.selectors.filter(s => !s.format);
  if (formatlessSelectors.length) {
    const listed = formatlessSelectors.map(s => `\`${s.selector}\``).join(', ');
    throw new Error('Following selectors have no specified format: ' + listed);
  }

  const selectorEntries = options.selectors.map(s => [s.selector, s]);
  const picker = new DecisionTree(selectorEntries).build(hp2Builder);

  if (typeof options.encodeCharacters !== 'function') {
    options.encodeCharacters = makeReplacerFromDict(options.encodeCharacters);
  }

  // Base selectors are mapped to their 1-based positions.
  const baseSelectorEntries = options.baseElements.selectors.map((s, i) => [s, i + 1]);
  const baseSelectorsPicker = new DecisionTree(baseSelectorEntries).build(hp2Builder);
  const findBaseElements = (dom) => findBases(dom, options, baseSelectorsPicker);

  // Whatever lies deeper than the allowed depth is replaced with the ellipsis.
  const onDepthLimit = (dom, builder) => {
    builder.addInline(options.limits.ellipsis || '');
  };
  const limitedWalk = limitedDepthRecursive(options.limits.maxDepth, recursiveWalk, onDepthLimit);

  return (html, metadata = undefined) => process(
    html,
    metadata,
    options,
    picker,
    findBaseElements,
    limitedWalk
  );
}
/**
 * Convert given HTML according to preprocessed options.
 *
 * Input longer than `options.limits.maxInputLength` is truncated (with a console warning)
 * before parsing.
 *
 * @param { string } html HTML content to convert.
 * @param { any } metadata Optional metadata for HTML document, for use in formatters.
 * @param { Options } options HtmlToText options (preprocessed).
 * @param { import('selderee').Picker<DomNode, TagDefinition> } picker
 * Tag definition picker for DOM nodes processing.
 * @param { (dom: DomNode[]) => DomNode[] } findBaseElements
 * Function to extract elements from HTML DOM
 * that will only be present in the output text.
 * @param { RecursiveCallback } walk Recursive callback.
 * @returns { string }
 */
function process (html, metadata, options, picker, findBaseElements, walk) {
  const { maxInputLength } = options.limits;
  let input = html;
  if (maxInputLength && html && html.length > maxInputLength) {
    console.warn(
      `Input length ${html.length} is above allowed limit of ${maxInputLength}. Truncating without ellipsis.`
    );
    input = html.substring(0, maxInputLength);
  }

  const { children } = parseDocument(input, { decodeEntities: options.decodeEntities });
  const bases = findBaseElements(children);
  const builder = new BlockTextBuilder(options, picker, metadata);
  walk(bases, builder);
  return builder.toString();
}
/**
 * Collect the elements that match one of the base selectors.
 *
 * The search does not descend into an element that already matched.
 * It honours `limits.maxChildNodes`, `limits.maxDepth` and `limits.maxBaseElements`.
 *
 * @param { DomNode[] } dom Top-level nodes of the parsed document.
 * @param { Options } options HtmlToText options (preprocessed).
 * @param { { pick1: (elem: DomNode) => number } } baseSelectorsPicker
 * Picker returning a positive selector index for a matching element.
 * @returns { DomNode[] } Matched elements, or `dom` itself when nothing matched
 * and `baseElements.returnDomByDefault` is set.
 */
function findBases (dom, options, baseSelectorsPicker) {
  const matches = [];

  function collect (descend, /** @type { DomNode[] } */ nodes) {
    const considered = nodes.slice(0, options.limits.maxChildNodes);
    for (const node of considered) {
      if (node.type !== 'tag') { continue; }

      const selectorIndex = baseSelectorsPicker.pick1(node);
      if (selectorIndex > 0) {
        matches.push({ selectorIndex: selectorIndex, element: node });
      } else if (node.children) {
        descend(node.children);
      }

      // Checked after every element so that nested calls unwind as soon as the limit is hit.
      if (matches.length >= options.limits.maxBaseElements) { return; }
    }
  }

  const search = limitedDepthRecursive(options.limits.maxDepth, collect);
  search(dom);

  const keepDocumentOrder = options.baseElements.orderBy === 'occurrence';
  if (!keepDocumentOrder) { // 'selectors'
    matches.sort((left, right) => left.selectorIndex - right.selectorIndex);
  }

  if (options.baseElements.returnDomByDefault && matches.length === 0) {
    return dom;
  }
  return matches.map(match => match.element);
}
/**
 * Function to walk through DOM nodes and accumulate their string representations.
 *
 * Text nodes are added inline; tag nodes are handed to the formatter named by
 * the tag definition that `builder.picker` selects for them. Other node types are ignored.
 * When there are more nodes than `limits.maxChildNodes`, the extra nodes are
 * dropped and the ellipsis text is emitted in their place.
 *
 * @param { RecursiveCallback } walk Recursive callback.
 * @param { DomNode[] } [dom] Nodes array to process.
 * @param { BlockTextBuilder } builder Passed around to accumulate output text.
 * @private
 */
function recursiveWalk (walk, dom, builder) {
  if (!dom) { return; }

  const options = builder.options;
  const { maxChildNodes, ellipsis } = options.limits;

  let nodes = dom;
  if (dom.length > maxChildNodes) {
    // `slice` makes a copy, so the caller's array is left untouched.
    nodes = dom.slice(0, maxChildNodes);
    nodes.push({
      // Fall back to an empty string (as the depth limit handler does),
      // so that an unset ellipsis never reaches the builder as `undefined`.
      data: ellipsis || '',
      type: 'text'
    });
  }

  for (const elem of nodes) {
    switch (elem.type) {
      case 'text': {
        builder.addInline(elem.data);
        break;
      }
      case 'tag': {
        const tagDefinition = builder.picker.pick1(elem);
        const format = options.formatters[tagDefinition.format];
        format(elem, walk, builder, tagDefinition.options || {});
        break;
      }
    }
  }
}
/**
 * Build a function that replaces characters according to a dictionary.
 *
 * @param { Object<string,string | false> } dict
 * A dictionary where keys are characters to replace
 * and values are replacement strings.
 * Entries with the value `false` are ignored, and so are empty keys.
 *
 * First code point from dict keys is used.
 * Compound emojis with ZWJ are not supported (not until Node 16).
 *
 * @returns { ((str: string) => string) | undefined }
 * Replacer function, or `undefined` when there is nothing to replace.
 */
function makeReplacerFromDict (dict) {
  if (!dict) {
    return undefined;
  }
  /** @type { [string, string][] } */
  const entries = Object.entries(dict)
    .filter(([key, value]) => key !== '' && value !== false);
  // Checked after filtering: with no usable entries the pattern would be empty,
  // match between every two characters and insert "undefined" at each position.
  if (entries.length === 0) {
    return undefined;
  }
  // One capture group per entry; the group that matched identifies the replacement.
  const regex = new RegExp(
    entries
      .map(([key]) => `(${unicodeEscape([...key][0])})`)
      .join('|'),
    'g'
  );
  const values = entries.map(([, value]) => value);
  // Exactly one capture group is non-empty for any match, and it comes
  // before the trailing offset/input arguments passed by `replace`.
  const replacer = (m, ...cgs) => values[cgs.findIndex(cg => cg)];
  return (str) => str.replace(regex, replacer);
}
/**
 * No-op formatter: the element and everything inside it produce no output.
 *
 * @type { FormatCallback }
 */
function formatSkip (elem, walk, builder, formatOptions) {
  // Intentionally empty: neither the builder nor the walker is touched.
  return undefined;
}
/**
 * Replace the element with a fixed string (`formatOptions.string`), added inline as a literal.
 * The element's children are not processed.
 *
 * @type { FormatCallback }
 */
function formatInlineString (elem, walk, builder, formatOptions) {
  const literal = formatOptions.string || '';
  builder.addLiteral(literal);
}
/**
 * Replace the element with a block that contains a fixed string (`formatOptions.string`).
 * The element's children are not processed.
 *
 * @type { FormatCallback }
 */
function formatBlockString (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });

  const literal = formatOptions.string || '';
  builder.addLiteral(literal);

  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
/**
 * Inline-level element: contributes nothing by itself, only its children are processed.
 *
 * @type { FormatCallback }
 */
function formatInline (elem, walk, builder, formatOptions) {
  const { children } = elem;
  walk(children, builder);
}
/**
 * Block-level container: children are processed inside a block
 * separated from its surroundings by line breaks (2 on each side unless configured).
 *
 * @type { FormatCallback }
 */
function formatBlock$1 (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });

  walk(elem.children, builder);

  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
/**
 * Render the opening tag of an element, including its attributes.
 *
 * Attributes with an empty value are rendered as bare names; any other value
 * is wrapped in double quotes with inner double quotes escaped as `&quot;`.
 *
 * @param { DomNode } elem Element to render the opening tag for.
 * @returns { string }
 */
function renderOpenTag (elem) {
  // `attribs` is a plain name-to-value object and has no `length` property,
  // so whether there is anything to render is derived from its entries.
  const renderedAttributes = Object.entries(elem.attribs || {})
    .map(([name, value]) => ((value === '')
      ? name
      : `${name}="${String(value).replace(/"/g, '&quot;')}"`));
  return (renderedAttributes.length > 0)
    ? `<${elem.name} ${renderedAttributes.join(' ')}>`
    : `<${elem.name}>`;
}
/**
 * Render the closing tag of an element.
 *
 * @param { DomNode } elem Element to render the closing tag for.
 * @returns { string }
 */
function renderCloseTag (elem) {
  const { name } = elem;
  return '</' + name + '>';
}
/**
 * Keep the element as an inline HTML tag: its opening and closing tags are
 * added as non-wrapping literals, its children are processed in between.
 *
 * @type { FormatCallback }
 */
function formatInlineTag (elem, walk, builder, formatOptions) {
  // The tag is rendered between start/stop so the call order matches a plain sequence.
  const addUnwrapped = (renderTag) => {
    builder.startNoWrap();
    builder.addLiteral(renderTag(elem));
    builder.stopNoWrap();
  };

  addUnwrapped(renderOpenTag);
  walk(elem.children, builder);
  addUnwrapped(renderCloseTag);
}
/**
 * Keep the element as an HTML block tag: inside a block, its opening and closing
 * tags are added as non-wrapping literals and its children are processed in between.
 *
 * @type { FormatCallback }
 */
function formatBlockTag (elem, walk, builder, formatOptions) {
  // The tag is rendered between start/stop so the call order matches a plain sequence.
  const addUnwrapped = (renderTag) => {
    builder.startNoWrap();
    builder.addLiteral(renderTag(elem));
    builder.stopNoWrap();
  };

  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });

  addUnwrapped(renderOpenTag);
  walk(elem.children, builder);
  addUnwrapped(renderCloseTag);

  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
/**
 * Serialize the element together with all its children back to HTML
 * and add the result inline as a single non-wrapping literal.
 *
 * @type { FormatCallback }
 */
function formatInlineHtml (elem, walk, builder, formatOptions) {
  builder.startNoWrap();
  const serializerOptions = { decodeEntities: builder.options.decodeEntities };
  const html = render(elem, serializerOptions);
  builder.addLiteral(html);
  builder.stopNoWrap();
}
/**
 * Serialize the element together with all its children back to HTML
 * and add the result as a non-wrapping literal inside its own block.
 *
 * @type { FormatCallback }
 */
function formatBlockHtml (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });

  builder.startNoWrap();
  const serializerOptions = { decodeEntities: builder.options.decodeEntities };
  const html = render(elem, serializerOptions);
  builder.addLiteral(html);
  builder.stopNoWrap();

  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
/**
 * Inline element whose processed children are placed between
 * `formatOptions.prefix` and `formatOptions.suffix` literals (empty by default).
 *
 * @type { FormatCallback }
 */
function formatInlineSurround (elem, walk, builder, formatOptions) {
  const opening = formatOptions.prefix || '';
  builder.addLiteral(opening);

  walk(elem.children, builder);

  const closing = formatOptions.suffix || '';
  builder.addLiteral(closing);
}
// Generic formatters, keyed by the name a selector definition refers to in its `format` field.
// The object has no prototype (`__proto__: null`) and is frozen, so it is a read-only lookup table.
// `compile` copies these entries into `options.formatters`; entries of `textFormatters`
// and user-supplied formatters with the same name take precedence there.
var genericFormatters = /*#__PURE__*/Object.freeze({
__proto__: null,
block: formatBlock$1,
blockHtml: formatBlockHtml,
blockString: formatBlockString,
blockTag: formatBlockTag,
inline: formatInline,
inlineHtml: formatInlineHtml,
inlineString: formatInlineString,
inlineSurround: formatInlineSurround,
inlineTag: formatInlineTag,
skip: formatSkip
});
/**
 * Get row `j` of a matrix, creating (and storing) an empty row when there is none yet.
 *
 * @param { any[][] } matrix Matrix to read from, possibly extended in place.
 * @param { number } j Row index.
 * @returns { any[] }
 */
function getRow (matrix, j) {
  const existing = matrix[j];
  if (existing) { return existing; }

  const created = [];
  matrix[j] = created;
  return created;
}
/**
 * Find the first index, starting from `x`, at which the row holds no (truthy) value.
 *
 * @param { any[] } row Row to inspect.
 * @param { number } [x = 0] Index to start from.
 * @returns { number }
 */
function findFirstVacantIndex (row, x = 0) {
  let index = x;
  for (; row[index]; index += 1) { /* occupied, keep looking */ }
  return index;
}
/**
 * Transpose a (possibly ragged) matrix in place, treating it as `maxSize` by `maxSize`.
 * Missing rows are created; a pair of positions is only swapped when at least one
 * of them holds a truthy value.
 *
 * @param { any[][] } matrix Matrix to transpose.
 * @param { number } maxSize Number of rows and columns to consider.
 */
function transposeInPlace (matrix, maxSize) {
  for (let upper = 0; upper < maxSize; upper++) {
    const upperRow = getRow(matrix, upper);
    for (let lower = 0; lower < upper; lower++) {
      const lowerRow = getRow(matrix, lower);
      const bothVacant = !upperRow[lower] && !lowerRow[upper];
      if (bothVacant) { continue; }
      [upperRow[lower], lowerRow[upper]] = [lowerRow[upper], upperRow[lower]];
    }
  }
}
/**
 * Reference a cell from every layout position it covers,
 * i.e. `cell.rowspan` rows by `cell.colspan` columns starting at the base position.
 *
 * @param { TablePrinterCell } cell Cell to place.
 * @param { TablePrinterCell[][] } layout Layout matrix, extended in place.
 * @param { number } baseRow Row index of the cell's top left corner.
 * @param { number } baseCol Column index of the cell's top left corner.
 */
function putCellIntoLayout (cell, layout, baseRow, baseCol) {
  const { rowspan, colspan } = cell;
  let rowShift = 0;
  while (rowShift < rowspan) {
    const targetRow = getRow(layout, baseRow + rowShift);
    let colShift = 0;
    while (colShift < colspan) {
      targetRow[baseCol + colShift] = cell;
      colShift++;
    }
    rowShift++;
  }
}
/**
 * Read the offset at `index`; when it is not defined yet, initialize it
 * to one more than the preceding offset (index 0 starts at 0).
 *
 * @param { number[] } offsets Offsets array, filled in place as needed.
 * @param { number } index Position to read.
 * @returns { number }
 */
function getOrInitOffset (offsets, index) {
  const known = offsets[index];
  if (known !== undefined) { return known; }

  const initial = (index === 0)
    ? 0
    : 1 + getOrInitOffset(offsets, index - 1);
  offsets[index] = initial;
  return initial;
}
/**
 * Make sure the offset at `base + span` is at least `value` past the offset at `base`.
 * An offset that is already larger is kept.
 *
 * @param { number[] } offsets Offsets array, updated in place.
 * @param { number } base Index the span starts at.
 * @param { number } span Number of positions covered.
 * @param { number } value Minimal distance between the two offsets.
 */
function updateOffset (offsets, base, span, value) {
  const target = base + span;
  // The target is read first, which also initializes every offset before it.
  const current = getOrInitOffset(offsets, target);
  const required = getOrInitOffset(offsets, base) + value;
  offsets[target] = Math.max(current, required);
}
/**
* Render a table into a string.
* Cells can contain multiline text and span across multiple rows and columns.
*
* Modifies cells: adds a `lines` array to each cell and sets a `rendered` flag on each cell it prints.
*
* @param { TablePrinterCell[][] } tableRows Table to render.
* @param { number } rowSpacing Number of empty lines between rows.
* @param { number } colSpacing Number of spaces between columns.
* @returns { string }
*/
function tableToString (tableRows, rowSpacing, colSpacing) {
// Grid of cell references; a cell with spans is referenced from every position it covers.
const layout = [];
let colNumber = 0;
const rowNumber = tableRows.length;
// rowOffsets[j] is the index of the output line at which grid row j starts.
const rowOffsets = [0];
// Fill the layout table and row offsets row-by-row.
for (let j = 0; j < rowNumber; j++) {
const layoutRow = getRow(layout, j);
const cells = tableRows[j];
let x = 0;
for (let i = 0; i < cells.length; i++) {
const cell = cells[i];
// Skip positions already occupied (e.g. by cells spanning down from previous rows).
x = findFirstVacantIndex(layoutRow, x);
putCellIntoLayout(cell, layout, j, x);
x += cell.colspan;
cell.lines = cell.text.split('\n');
const cellHeight = cell.lines.length;
// The row following this cell must start below all of its lines plus the spacing.
updateOffset(rowOffsets, j, cell.rowspan, cellHeight + rowSpacing);
}
colNumber = (layoutRow.length > colNumber) ? layoutRow.length : colNumber;
}
// After transposition the layout is addressed as layout[column][row].
transposeInPlace(layout, (rowNumber > colNumber) ? rowNumber : colNumber);
const outputLines = [];
// colOffsets[x] is the character position at which grid column x starts.
const colOffsets = [0];
// Fill column offsets and output lines column-by-column.
for (let x = 0; x < colNumber; x++) {
let y = 0;
let cell;
const rowsInThisColumn = Math.min(rowNumber, layout[x].length);
while (y < rowsInThisColumn) {
cell = layout[x][y];
if (cell) {
// A spanning cell is referenced from several positions but must be printed only once.
if (!cell.rendered) {
let cellWidth = 0;
for (let j = 0; j < cell.lines.length; j++) {
const line = cell.lines[j];
const lineOffset = rowOffsets[y] + j;
// Pad the output line up to the start of this column, then append the cell line.
outputLines[lineOffset] = (outputLines[lineOffset] || '').padEnd(colOffsets[x]) + line;
cellWidth = (line.length > cellWidth) ? line.length : cellWidth;
}
// The column following this cell must start after its widest line plus the spacing.
updateOffset(colOffsets, x, cell.colspan, cellWidth + colSpacing);
cell.rendered = true;
}
y += cell.rowspan;
} else {
// Vacant position: only make sure the corresponding output line exists.
const lineOffset = rowOffsets[y];
outputLines[lineOffset] = (outputLines[lineOffset] || '');
y++;
}
}
}
return outputLines.join('\n');
}
/**
 * Line break element: asks the builder to add a line break, nothing else is processed.
 *
 * @type { FormatCallback }
 */
function formatLineBreak (elem, walk, builder, formatOptions) {
  const textBuilder = builder;
  textBuilder.addLineBreak();
}
/**
 * `wbr` element: marks a word break opportunity in the builder, nothing else is processed.
 *
 * @type { FormatCallback }
 */
function formatWbr (elem, walk, builder, formatOptions) {
  const textBuilder = builder;
  textBuilder.addWordBreakOpportunity();
}
/**
 * Horizontal rule: a block holding a line of dashes.
 * The line is `formatOptions.length` characters long, falling back to the
 * `wordwrap` option and then to 40.
 *
 * @type { FormatCallback }
 */
function formatHorizontalLine (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });

  const lineLength = formatOptions.length || builder.options.wordwrap || 40;
  builder.addInline('-'.repeat(lineLength));

  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
/**
 * Paragraph: children are processed inside a block
 * separated from its surroundings by line breaks (2 on each side unless configured).
 *
 * @type { FormatCallback }
 */
function formatParagraph (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks });

  walk(elem.children, builder);

  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
/**
 * Preformatted content: children are processed inside a block opened with the `isPre` flag.
 *
 * @type { FormatCallback }
 */
function formatPre (elem, walk, builder, formatOptions) {
  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ isPre: true, leadingLineBreaks });

  walk(elem.children, builder);

  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks });
}
/**
 * Heading: a block whose words are converted to upper case,
 * unless `formatOptions.uppercase` is explicitly `false`.
 *
 * @type { FormatCallback }
 */
function formatHeading (elem, walk, builder, formatOptions) {
  const shouldUppercase = formatOptions.uppercase !== false;

  builder.openBlock({ leadingLineBreaks: formatOptions.leadingLineBreaks || 2 });
  if (shouldUppercase) {
    builder.pushWordTransform(word => word.toUpperCase());
  }
  walk(elem.children, builder);
  if (shouldUppercase) {
    builder.popWordTransform();
  }
  builder.closeBlock({ trailingLineBreaks: formatOptions.trailingLineBreaks || 2 });
}
/**
 * Blockquote: a block in which every line is prefixed with `> `.
 * Two characters of line length are reserved for that prefix. Newlines at both
 * ends of the block text are trimmed first, unless `formatOptions.trimEmptyLines` is `false`.
 *
 * @type { FormatCallback }
 */
function formatBlockquote (elem, walk, builder, formatOptions) {
  const quoteLines = (str) => {
    const content = (formatOptions.trimEmptyLines !== false)
      ? trimCharacter(str, '\n')
      : str;
    return content
      .split('\n')
      .map(line => '> ' + line)
      .join('\n');
  };

  const leadingLineBreaks = formatOptions.leadingLineBreaks || 2;
  builder.openBlock({ leadingLineBreaks, reservedLineLength: 2 });

  walk(elem.children, builder);

  const trailingLineBreaks = formatOptions.trailingLineBreaks || 2;
  builder.closeBlock({ trailingLineBreaks, blockTransform: quoteLines });
}
/**
 * Surround a string with brackets.
 *
 * @param { string } str String to wrap.
 * @param { [string, string] | false } brackets
 * Opening and closing bracket. A falsy value returns the string unchanged;
 * an item that is not a string falls back to `[` or `]` respectively.
 * @returns { string }
 */
function withBrackets (str, brackets) {
  if (!brackets) { return str; }

  const stringOr = (candidate, fallback) => ((typeof candidate === 'string') ? candidate : fallback);
  const opening = stringOr(brackets[0], '[');
  const closing = stringOr(brackets[1], ']');
  return opening + str + closing;
}
/**
 * Apply the optional rewriter function to a path, then prepend the base URL
 * (without its trailing slashes) when the resulting path starts with `/`.
 *
 * @param { string } path Path or URL taken from the element.
 * @param { Function } [rewriter] Called with `(path, metadata, elem)` when it is a function.
 * @param { string } [baseUrl] Base URL for paths that start with `/`.
 * @param { any } metadata Passed through to the rewriter.
 * @param { DomNode } elem Passed through to the rewriter.
 * @returns { string }
 */
function pathRewrite (path, rewriter, baseUrl, metadata, elem) {
  let rewritten = path;
  if (typeof rewriter === 'function') {
    rewritten = rewriter(path, metadata, elem);
  }

  if (rewritten[0] === '/' && baseUrl) {
    return trimCharacterEnd(baseUrl, '/') + rewritten;
  }
  return rewritten;
}
/**
 * Image: outputs the alt text followed by the (rewritten) source in brackets.
 * When only one of the two is present it is output alone; the text is added
 * inline with word transforms disabled.
 *
 * @type { FormatCallback }
 */
function formatImage (elem, walk, builder, formatOptions) {
  const { alt, src } = elem.attribs || {};
  const altText = alt || '';
  const source = (src)
    ? pathRewrite(src, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem)
    : '';

  let text = altText;
  if (source) {
    const link = withBrackets(source, formatOptions.linkBrackets);
    text = (altText) ? altText + ' ' + link : link;
  }
  builder.addInline(text, { noWordTransform: true });
}
// a img baseUrl
// a img pathRewrite
// a img linkBrackets
// a ignoreHref: false
// ignoreText ?
// a noAnchorUrl: true
// can be replaced with selector
// a hideLinkHrefIfSameAsText: false
// how to compare, what to show (text, href, normalized) ?
// a mailto protocol removed without options
// a protocols: mailto, tel, ...
// can be matched with selector?
// anchors, protocols - only if no pathRewrite fn is provided
// normalize-url ?
// a
// a[href^="#"] - format:skip by default
// a[href^="mailto:"] - ?
/**
 * Anchor: outputs the link text followed by the href in brackets.
 *
 * The href is dropped when `ignoreHref` is set, when there is none, or when it
 * points to an in-page anchor and `noAnchorUrl` is set; a leading `mailto:` is removed.
 * With `hideLinkHrefIfSameAsText` the href is not repeated after identical link text.
 *
 * @type { FormatCallback }
 */
function formatAnchor (elem, walk, builder, formatOptions) {
  let href = '';
  if (!formatOptions.ignoreHref && elem.attribs && elem.attribs.href) {
    const withoutMailto = elem.attribs.href.replace(/^mailto:/, '');
    const isSuppressedAnchor = formatOptions.noAnchorUrl && withoutMailto[0] === '#';
    if (!isSuppressedAnchor) {
      href = pathRewrite(withoutMailto, formatOptions.pathRewrite, formatOptions.baseUrl, builder.metadata, elem);
    }
  }

  if (!href) {
    walk(elem.children, builder);
    return;
  }

  // The transform leaves words unchanged; it only records the link text as it passes through.
  let text = '';
  const recordWord = (word) => {
    if (word) { text += word; }
    return word;
  };
  builder.pushWordTransform(recordWord);
  walk(elem.children, builder);
  builder.popWordTransform();

  if (formatOptions.hideLinkHrefIfSameAsText && href === text) { return; }

  const linkText = (text)
    ? ' ' + withBrackets(href, formatOptions.linkBrackets)
    : href;
  builder.addInline(linkText, { noWordTransform: true });
}
/**
 * Shared implementation of ordered and unordered lists.
 *
 * Whitespace-only text nodes between items are ignored. Children other than `li`
 * are processed as list items with an empty prefix. A list whose parent is an
 * `li` uses single line breaks around it and has leading whitespace trimmed from its prefixes.
 *
 * @param { DomNode } elem List items with their prefixes.
 * @param { RecursiveCallback } walk Recursive callback to process child nodes.
 * @param { BlockTextBuilder } builder Passed around to accumulate output text.
 * @param { FormatOptions } formatOptions Options specific to a formatter.
 * @param { () => string } nextPrefixCallback Function that returns increasing index each time it is called.
 */
function formatList (elem, walk, builder, formatOptions, nextPrefixCallback) {
  const isNestedList = get(elem, ['parent', 'name']) === 'li';

  // Prefix widths are compared directly: with Roman numerals the longest
  // prefix is not necessarily the last one.
  let maxPrefixLength = 0;
  const listItems = [];
  for (const child of (elem.children || [])) {
    // it might be more accurate to check only for html spaces here, but no significant benefit
    if (child.type === 'text' && /^\s*$/.test(child.data)) { continue; }

    let prefix = '';
    if (child.name === 'li') {
      prefix = nextPrefixCallback();
      if (isNestedList) { prefix = prefix.trimStart(); }
      if (prefix.length > maxPrefixLength) { maxPrefixLength = prefix.length; }
    }
    listItems.push({ node: child, prefix: prefix });
  }
  if (listItems.length === 0) { return; }

  const leadingLineBreaks = isNestedList ? 1 : (formatOptions.leadingLineBreaks || 2);
  builder.openList({
    interRowLineBreaks: 1,
    leadingLineBreaks: leadingLineBreaks,
    maxPrefixLength: maxPrefixLength,
    prefixAlign: 'left'
  });

  listItems.forEach(({ node, prefix }) => {
    builder.openListItem({ prefix: prefix });
    walk([node], builder);
    builder.closeListItem();
  });

  const trailingLineBreaks = isNestedList ? 1 : (formatOptions.trailingLineBreaks || 2);
  builder.closeList({ trailingLineBreaks: trailingLineBreaks });
}
/**
 * Unordered list: every item gets the same prefix
 * (`formatOptions.itemPrefix`, ` * ` by default).
 *
 * @type { FormatCallback }
 */
function formatUnorderedList (elem, walk, builder, formatOptions) {
  const itemPrefix = formatOptions.itemPrefix || ' * ';
  const nextPrefixCallback = () => itemPrefix;
  return formatList(elem, walk, builder, formatOptions, nextPrefixCallback);
}
/**
 * Process an ordered list.
 *
 * Numbering starts from the integer in the `start` attribute; a missing or
 * non-numeric value falls back to 1. The marker style is chosen by the `type` attribute.
 *
 * @type { FormatCallback }
 */
function formatOrderedList (elem, walk, builder, formatOptions) {
  const attribs = elem.attribs || {};
  // Parse as an integer and fall back to 1, so that an invalid value
  // can never surface as a "NaN." item prefix.
  const parsedStart = Number.parseInt(attribs.start, 10);
  let nextIndex = Number.isNaN(parsedStart) ? 1 : parsedStart;
  const indexFunction = getOrderedListIndexFunction(attribs.type);
  const nextPrefixCallback = () => ' ' + indexFunction(nextIndex++) + '. ';
  return formatList(elem, walk, builder, formatOptions, nextPrefixCallback);
}
/**
 * Choose how item numbers of an ordered list are turned into markers.
 * `a`/`A` give letter sequences, `i`/`I` give lower/upper case Roman numerals,
 * anything else gives plain decimal numbers.
 *
 * @param { string } [olType='1'] Marker type.
 * @returns { (i: number) => string }
 */
function getOrderedListIndexFunction (olType = '1') {
  if (olType === 'a' || olType === 'A') {
    // The marker type doubles as the base letter of the sequence.
    return (i) => numberToLetterSequence(i, olType);
  }
  if (olType === 'i') {
    return (i) => numberToRoman(i).toLowerCase();
  }
  if (olType === 'I') {
    return (i) => numberToRoman(i);
  }
  // '1' and any unrecognized type
  return (i) => (i).toString();
}
/**
 * Sort simple selectors into class names and ids, with the `.` / `#` prefix removed.
 * Selectors with any other prefix are ignored.
 *
 * @param { string[] } selectors Class and ID selectors (`[".class", "#id"]` etc).
 * @returns { { classes: string[], ids: string[] } }
 */
function splitClassesAndIds (selectors) {
  const all = [...selectors];
  const namesPrefixedWith = (prefix) => all
    .filter(selector => selector.startsWith(prefix))
    .map(selector => selector.substring(1));

  return {
    classes: namesPrefixedWith('.'),
    ids: namesPrefixedWith('#')
  };
}
/**
 * Decide whether a table has to be rendered as a data table.
 *
 * @param { Object<string,string> } [attr] Attributes of the table element.
 * @param { string[] | boolean } tables
 * `true` to treat every table as a data table, or a list of class and
 * ID selectors (`.class`, `#id`) of tables to treat as data tables.
 * Any other value matches no table.
 * @returns { boolean }
 */
function isDataTable (attr, tables) {
  if (tables === true) { return true; }
  // Only a list of selectors can match; guards against values such as `false` or `undefined`.
  if (!attr || !Array.isArray(tables)) { return false; }
  const { classes, ids } = splitClassesAndIds(tables);
  // Class names are separated by any ASCII whitespace, not only by single spaces.
  const attrClasses = (attr['class'] || '').split(/[ \t\n\f\r]+/);
  const attrIds = (attr['id'] || '').split(' ');
  return attrClasses.some(x => classes.includes(x)) || attrIds.some(x => ids.includes(x));
}
/**
 * Table: rendered as a data table when the `tables` option selects it,
 * otherwise treated as a plain block container.
 *
 * @type { FormatCallback }
 */
function formatTable (elem, walk, builder, formatOptions) {
  if (isDataTable(elem.attribs, builder.options.tables)) {
    return formatDataTable(elem, walk, builder, formatOptions);
  }
  return formatBlock(elem, walk, builder, formatOptions);
}
/**
 * Process a table that is not a data table as a block container.
 * Line break counts are passed to the builder exactly as given in the
 * options, without a fallback value.
 *
 * @type { FormatCallback }
 */
function formatBlock (elem, walk, builder, formatOptions) {
  const openOptions = { leadingLineBreaks: formatOptions.leadingLineBreaks };
  builder.openBlock(openOptions);

  walk(elem.children, builder);

  const closeOptions = { trailingLineBreaks: formatOptions.trailingLineBreaks };
  builder.closeBlock(closeOptions);
}
/**
 * Data table: rows and cells are registered with the builder, which renders
 * them with `tableToString` when the table is closed.
 *
 * Only `tr` rows are processed, either directly inside the table or inside
 * `thead`, `tbody`, `tfoot` and `center` wrappers; inside rows only `th` and `td`.
 * Header cells are upper-cased unless `formatOptions.uppercaseHeaderCells` is `false`.
 *
 * @type { FormatCallback }
 */
function formatDataTable (elem, walk, builder, formatOptions) {
  const ROW_CONTAINERS = ['thead', 'tbody', 'tfoot', 'center'];

  const formatCell = (cellNode) => {
    // Missing or non-numeric spans count as 1.
    const colspan = +get(cellNode, ['attribs', 'colspan']) || 1;
    const rowspan = +get(cellNode, ['attribs', 'rowspan']) || 1;
    builder.openTableCell({ maxColumnWidth: formatOptions.maxColumnWidth });
    walk(cellNode.children, builder);
    builder.closeTableCell({ colspan: colspan, rowspan: rowspan });
  };

  const formatUppercaseCell = (cellNode) => {
    builder.pushWordTransform(str => str.toUpperCase());
    formatCell(cellNode);
    builder.popWordTransform();
  };

  const formatHeaderCell = (formatOptions.uppercaseHeaderCells !== false)
    ? formatUppercaseCell
    : formatCell;

  const walkRow = (row) => {
    builder.openTableRow();
    for (const cellNode of row.children) {
      if (cellNode.type !== 'tag') { continue; }
      if (cellNode.name === 'th') {
        formatHeaderCell(cellNode);
      } else if (cellNode.name === 'td') {
        formatCell(cellNode);
      }
      // anything else inside a row is ignored
    }
    builder.closeTableRow();
  };

  const walkTable = (node) => {
    if (node.type !== 'tag') { return; }
    if (ROW_CONTAINERS.includes(node.name)) {
      node.children.forEach(child => walkTable(child));
    } else if (node.name === 'tr') {
      walkRow(node);
    }
    // any other element is ignored
  };

  builder.openTable();
  elem.children.forEach(child => walkTable(child));
  builder.closeTable({
    tableToString: (rows) => tableToString(rows, formatOptions.rowSpacing ?? 0, formatOptions.colSpacing ?? 3),
    leadingLineBreaks: formatOptions.leadingLineBreaks,
    trailingLineBreaks: formatOptions.trailingLineBreaks
  });
}
// Formatters for specific HTML elements, keyed by the name a selector definition refers to in its `format` field.
// The object has no prototype (`__proto__: null`) and is frozen, so it is a read-only lookup table.
// `compile` copies these entries into `options.formatters` after those of `genericFormatters`;
// user-supplied formatters with the same name take precedence there.
var textFormatters = /*#__PURE__*/Object.freeze({
__proto__: null,
anchor: formatAnchor,
blockquote: formatBlockquote,
dataTable: formatDataTable,
heading: formatHeading,
horizontalLine: formatHorizontalLine,
image: formatImage,
lineBreak: formatLineBreak,
orderedList: formatOrderedList,
paragraph: formatParagraph,
pre: formatPre,
table: formatTable,
unorderedList: formatUnorderedList,
wbr: formatWbr
});
/**
* Default options.
*
* `compile` deep-merges user options over this object: arrays supplied by the user
* replace the defaults, except for `selectors`, whose definitions are appended
* to the defaults and then deduplicated by selector.
*
* @constant
* @type { Options }
* @default
* @private
*/
const DEFAULT_OPTIONS = {
// Which parts of the document are converted (see `findBases`).
baseElements: {
selectors: [ 'body' ],
// 'selectors' sorts found elements by the position of the matching selector; 'occurrence' keeps them in the order they were found.
orderBy: 'selectors', // 'selectors' | 'occurrence'
// When no base element is found, convert the whole document.
returnDomByDefault: true
},
// Passed to the HTML parser and to the HTML serializer used by the `*Html` formatters.
decodeEntities: true,
// A dictionary here is compiled into a replacer function by `makeReplacerFromDict`; a function is used as is.
encodeCharacters: {},
// Additional formatters; they override built-in formatters with the same name.
formatters: {},
limits: {
// Text emitted in place of content cut by `maxChildNodes` or `maxDepth`.
ellipsis: '...',
// `undefined` means no limit for the three limits below.
maxBaseElements: undefined,
maxChildNodes: undefined,
maxDepth: undefined,
// Longer input is truncated to this length (a console warning is printed).
maxInputLength: (1 << 24) // 16_777_216
},
longWordSplit: {
forceWrapOnLimit: false,
wrapCharacters: []
},
preserveNewlines: false,
// Each definition names the formatter (a key of `options.formatters`) used for the
// elements it is picked for, and the options passed to that formatter.
selectors: [
{ selector: '*', format: 'inline' },
{
selector: 'a',
format: 'anchor',
options: {
baseUrl: null,
hideLinkHrefIfSameAsText: false,
ignoreHref: false,
linkBrackets: ['[', ']'],
noAnchorUrl: true
}
},
{ selector: 'article', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{ selector: 'aside', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{
selector: 'blockquote',
format: 'blockquote',
options: { leadingLineBreaks: 2, trailingLineBreaks: 2, trimEmptyLines: true }
},
{ selector: 'br', format: 'lineBreak' },
{ selector: 'div', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{ selector: 'footer', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{ selector: 'form', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{ selector: 'h1', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } },
{ selector: 'h2', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } },
{ selector: 'h3', format: 'heading', options: { leadingLineBreaks: 3, trailingLineBreaks: 2, uppercase: true } },
{ selector: 'h4', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } },
{ selector: 'h5', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } },
{ selector: 'h6', format: 'heading', options: { leadingLineBreaks: 2, trailingLineBreaks: 2, uppercase: true } },
{ selector: 'header', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{
selector: 'hr',
format: 'horizontalLine',
// With `length` undefined the line is as long as `wordwrap` (or 40 characters when that is unset).
options: { leadingLineBreaks: 2, length: undefined, trailingLineBreaks: 2 }
},
{
selector: 'img',
format: 'image',
options: { baseUrl: null, linkBrackets: ['[', ']'] }
},
{ selector: 'main', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{ selector: 'nav', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{
selector: 'ol',
format: 'orderedList',
options: { leadingLineBreaks: 2, trailingLineBreaks: 2 }
},
{ selector: 'p', format: 'paragraph', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } },
{ selector: 'pre', format: 'pre', options: { leadingLineBreaks: 2, trailingLineBreaks: 2 } },
{ selector: 'section', format: 'block', options: { leadingLineBreaks: 1, trailingLineBreaks: 1 } },
{
selector: 'table',
format: 'table',
options: {
colSpacing: 3,
leadingLineBreaks: 2,
maxColumnWidth: 60,
rowSpacing: 0,
trailingLineBreaks: 2,
uppercaseHeaderCells: true
}
},
{
selector: 'ul',
format: 'unorderedList',
options: { itemPrefix: ' * ', leadingLineBreaks: 2, trailingLineBreaks: 2 }
},
{ selector: 'wbr', format: 'wbr' },
],
// `true` or a list of `.class` / `#id` selectors of tables to render as data tables (see `isDataTable`).
tables: [], // deprecated
whitespaceCharacters: ' \t\r\n\f\u200b',
wordwrap: 80
};
/** Array merge strategy: items of the source are appended to a copy of the accumulated array. */
const concatMerge = (acc, src, options) => {
  const merged = [...acc];
  merged.push(...src);
  return merged;
};

/** Array merge strategy: the result is a copy of the source, the accumulated array is dropped. */
const overwriteMerge = (acc, src, options) => Array.from(src);

/**
 * Merge strategy for keys named `selectors`: arrays of selector definitions
 * (objects) are concatenated, arrays of plain selector strings are overwritten.
 */
const selectorsMerge = (acc, src, options) => {
  const holdsDefinitions = acc.some(item => typeof item === 'object');
  if (holdsDefinitions) {
    return concatMerge(acc, src); // selectors
  }
  return overwriteMerge(acc, src); // baseElements.selectors
};
/**
 * Prepare options and build a reusable converter function.
 *
 * User options are merged over the defaults, built-in formatters are added
 * (user formatters with the same name win), selector definitions are
 * deduplicated and deprecated options are mapped to their current equivalents.
 * Use this instead of `convert` when many documents share the same options.
 *
 * @param { Options } [options = {}] HtmlToText options.
 * @returns { (html: string, metadata?: any) => string } Pre-configured converter function.
 * @static
 */
function compile (options = {}) {
  const mergeSettings = {
    arrayMerge: overwriteMerge,
    // Only keys named `selectors` get special treatment.
    customMerge: (key) => {
      if (key === 'selectors') { return selectorsMerge; }
      return undefined;
    }
  };
  const prepared = merge(DEFAULT_OPTIONS, options, mergeSettings);

  prepared.formatters = { ...genericFormatters, ...textFormatters, ...prepared.formatters };
  prepared.selectors = mergeDuplicatesPreferLast(prepared.selectors, (definition => definition.selector));
  handleDeprecatedOptions(prepared);

  return compile$1(prepared);
}
/**
 * One-off conversion of HTML content to a plain text string.
 * The options are compiled on every call; use `compile` to reuse them.
 *
 * @param { string } html HTML content to convert.
 * @param { Options } [options = {}] HtmlToText options.
 * @param { any } [metadata] Optional metadata for HTML document, for use in formatters.
 * @returns { string } Plain text string.
 * @static
 *
 * @example
 * const { convert } = require('html-to-text');
 * const text = convert('<h1>Hello World</h1>', {
 *   wordwrap: 130
 * });
 * console.log(text); // HELLO WORLD
 */
function convert (html, options = {}, metadata = undefined) {
  const converter = compile(options);
  return converter(html, metadata);
}
/**
 * Translate deprecated options into the current options layout, in place.
 * This is a subject for cleanup in major releases.
 *
 * Handled options: `tags` (appended to `selectors`), `baseElement`,
 * `returnDomByDefault` and the `noLinkBrackets` option of anchor definitions.
 *
 * @param { Options } options HtmlToText options.
 */
function handleDeprecatedOptions (options) {
  // Assign `value` at the nested `path`, creating intermediate objects where they are missing.
  const setDeep = (target, path, value) => {
    const parentKeys = path.slice(0, -1);
    const lastKey = path[path.length - 1];
    let cursor = target;
    for (const key of parentKeys) {
      if (!cursor[key]) { cursor[key] = {}; }
      cursor = cursor[key];
    }
    cursor[lastKey] = value;
  };

  if (options.tags) {
    // An empty tag name used to stand for "any tag".
    const definitionsFromTags = Object.entries(options.tags)
      .map(([selector, definition]) => ({ ...definition, selector: selector || '*' }));
    options.selectors.push(...definitionsFromTags);
    options.selectors = mergeDuplicatesPreferLast(options.selectors, (s => s.selector));
  }

  const legacyBaseElement = options['baseElement'];
  if (legacyBaseElement) {
    const baseSelectors = Array.isArray(legacyBaseElement) ? legacyBaseElement : [legacyBaseElement];
    setDeep(options, ['baseElements', 'selectors'], baseSelectors);
  }

  const legacyReturnDom = options['returnDomByDefault'];
  if (legacyReturnDom !== undefined) {
    setDeep(options, ['baseElements', 'returnDomByDefault'], legacyReturnDom);
  }

  options.selectors
    .filter(definition => definition.format === 'anchor' && get(definition, ['options', 'noLinkBrackets']))
    .forEach(definition => setDeep(definition, ['options', 'linkBrackets'], false));
}
export { compile, convert, convert as htmlToText };