🤖 Merge PR #49531 html-to-text: upgrade to v6.0 by @webstech

html-to-text v6 introduces breaking changes in how formatters are used.
Code that does not use custom formatter routines should migrate
transparently.  Authors of code that uses formatters should review the
upstream documentation and the test code included in this package.

Signed-off-by: Chris. Webster <chris@webstech.net>
This commit is contained in:
Chris. Webster
2020-12-03 11:04:52 -08:00
committed by GitHub
parent 31cadde89d
commit 4a503edfec
4 changed files with 519 additions and 147 deletions

View File

@@ -1,27 +1,84 @@
import { fromString, HtmlToTextOptions } from 'html-to-text';
import { FormatCallback, htmlToText, HtmlToTextOptions,
TagDefinition } from 'html-to-text';
import * as formatters from 'html-to-text/lib/formatter';
// Test code that also provides sample implementations
// Tag definition that selects the formatter registered under the key
// "headerFormatter" and passes it `uppercase: false` as its option.
const headerOptions: TagDefinition = {
    options: { uppercase: false },
    format: "headerFormatter",
};
// Sample FormatCallback declared outside of HtmlToTextOptions.
// It opens a block, walks the element's children into it, and closes the
// block with a transform that appends an "=" underline as long as the
// block's last line.
const headerFormatter: FormatCallback = (elem, walk, builder, options) => {
    // Block transform: underline only the text after the final newline.
    const underlineLastLine = (blockText: string): string => {
        const lastLineStart = blockText.lastIndexOf("\n") + 1;
        const underline = blockText.slice(lastLineStart).replace(/./g, "=");
        return `${blockText}\n${underline}`;
    };

    builder.openBlock(options.leadingLineBreaks || 2);
    walk(elem.children, builder);
    builder.closeBlock(options.trailingLineBreaks || 2, underlineLastLine);
};
const htmlOptions: HtmlToTextOptions = {
wordwrap: null,
tables: true,
hideLinkHrefIfSameAsText: true,
ignoreImage: true,
format: {
text: (el, options) => {
return formatters.text(el, options);
formatters: {
headerFormatter: (elem, walk, builder, options) => {
builder.openBlock(options.leadingLineBreaks || 2);
walk(elem.children, builder);
builder.closeBlock(options.trailingLineBreaks || 2,
str => `${str} **hdr**\n`);
},
table: (el, walk, options) => {
return formatters.table(el, walk, options);
blockFormatter: (elem, walk, builder, options) => {
builder.openBlock(options.leadingLineBreaks || 2, 2);
walk(elem.children, builder);
builder.closeBlock(options.trailingLineBreaks || 2,
str => `**blk** ${str}\n`);
},
textFormatter: (elem, walk, builder, options) => {
formatters.heading(elem, walk, builder, options);
},
},
tags: {
a: {
options: {
hideLinkHrefIfSameAsText: true,
},
},
h1: headerOptions,
h3: {
format: "textFormatter",
},
blockquote: {
options: {
trimEmptyLines: false
},
format: "blockFormatter",
},
},
};
const htmlString = '<p><b>bold</b></p><p><i>italic</i></p>';
const htmlString = `<h1>h1</h1><p><b>bold</b></p><p><i>italic</i></p>
<h3>h3</h3><blockquote>block quote</blockquote>`;
console.log('Processing string with default options');
console.log(fromString(htmlString));
console.log(htmlToText(htmlString));
console.log('Processing string with custom options');
console.log(fromString(htmlString, htmlOptions));
const text = htmlToText(htmlString, htmlOptions);
console.log(text);
if (!text.match(/\*\*hdr\*\*/)) {
console.error("Formatter not called!");
}
const allElements = '<a>a</a>\
<blockquote>b</blockquote>\
@@ -34,41 +91,29 @@ const allElements = '<a>a</a>\
<table></table>\
<ul></ul>';
// Formatter used for every element kind in this test: it emits an empty block
// whose text is replaced by a `--<tag name>--` marker line. Children are
// deliberately not walked, so only the marker appears in the output.
const elementFormatter: FormatCallback = (elem, walk, builder, options) => {
    const leadingBreaks = options.leadingLineBreaks || 2;
    builder.openBlock(leadingBreaks);
    // walk(elem.children?, builder);
    const trailingBreaks = options.trailingLineBreaks || 2;
    // The block text is ignored; the marker is built when the block is closed.
    builder.closeBlock(trailingBreaks, () => `--${elem.name}--\n`);
};
const fmtOptions: HtmlToTextOptions = {
format: {
anchor: (_el, _walk, _options) => {
return "--anchor--\n";
},
blockquote: (_el, _walk, _options) => {
return "--blockquote--\n";
},
heading: (_el, _walk, _options) => {
return "--heading--\n";
},
horizontalLine: (_el, _walk, _options) => {
return "--horizontalLine--\n";
},
image: (_el, _options) => {
return "--image--\n";
},
lineBreak: (_el, _walk, _options) => {
return "--lineBreak--\n";
},
orderedList: (_el, _walk, _options) => {
return "--orderedList--\n";
},
paragraph: (_el, _walk, _options) => {
return "--paragraph--\n";
},
table: (_el, _walk, _options) => {
return "--table--\n";
},
text: (_el, _options) => {
return "--text--\n";
},
unorderedList: (_el, _walk, _options) => {
return "--unorderedList--\n";
},
formatters: {
anchor: elementFormatter,
blockquote: elementFormatter,
heading: elementFormatter,
horizontalLine: elementFormatter,
image: elementFormatter,
inline: elementFormatter,
lineBreak: elementFormatter,
orderedList: elementFormatter,
paragraph: elementFormatter,
table: elementFormatter,
unorderedList: elementFormatter,
},
};
console.log(fromString(allElements, fmtOptions));
console.log(htmlToText(allElements, fmtOptions));

View File

@@ -1,36 +1,106 @@
// Type definitions for html-to-text 5.1
// Project: https://github.com/werk85/node-html-to-text
// Type definitions for html-to-text 6.0
// Project: https://github.com/html-to-text/node-html-to-text
// Definitions by: Eryk Warren <https://github.com/erykwarren>
// Carson Full <https://github.com/CarsonF>
// Chris. Webster <https://github.com/webstech>
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
import { BlockTextBuilder } from './lib/block-text-builder';
/**
* Convert html string to text
* Convert given HTML content to plain text string.
*
* @param str String of html content
* @param options Hash of options
*
* @return String with the converted text.
* @example
* const { htmlToText } = require('html-to-text');
* const text = htmlToText('<h1>Hello World</h1>', {
* wordwrap: 130
* });
* console.log(text); // HELLO WORLD
*/
export function fromString(str: string, options?: HtmlToTextOptions): string;
export function htmlToText(html: string, options?: HtmlToTextOptions): string;
/**
* @deprecated Import/require `{ htmlToText }` function instead!
* @see htmlToText
*/
export function fromString(html: string, options?: HtmlToTextOptions): string;
export interface HtmlToTextOptions {
/**
* Defines after how many chars a line break should follow in p elements.
* Set to null or false to disable word-wrapping. Default: 80
* The resulting text output will be composed from the text content of this element
* (or elements if an array of strings is specified).
*
* Each entry is a single tag name with optional css class and id parameters,
* e.g. `['p.class1.class2#id1#id2', 'p.class1.class2#id1#id2']`.
*/
wordwrap?: number | false | null;
baseElement?: string | string[];
/**
* Allows to select certain tables by the class or id attribute from the HTML
* document. This is necessary because the majority of HTML E-Mails uses a
* table based layout. Prefix your table selectors with an . for the class
* and with a # for the id attribute. All other tables are ignored.
* You can assign true to this attribute to select all tables. Default: []
* Text decoding options given to `he.decode`.
*
* For more information see the [he](https://github.com/mathiasbynens/he) module.
*/
decodeOptions?: DecodeOptions;
/**
* A dictionary with custom formatting functions for specific kinds of elements.
*
* Keys are custom string identifiers, values are callbacks.
*/
formatters?: Record<string, FormatCallback>;
/**
* Options for handling complex documents and limiting the output size.
*/
limits?: LimitsOptions;
/**
* Describes how to wrap long words.
*/
longWordSplit?: LongWordSplitOptions;
/**
* By default, any newlines `\n` from the input HTML are dropped.
*
* If `true`, these newlines will be preserved in the output.
*/
preserveNewlines?: boolean;
/**
* Use the entire document if we don't find the tag defined in `Options.baseElement`.
*/
returnDomByDefault?: boolean;
/**
* Allows to select and format certain tables by the `class` or `id` attribute from the HTML document.
*
* This is necessary because the majority of HTML E-Mails uses a table based layout.
*
* Prefix your table selectors with a `.` for the `class` and with a `#` for the `id` attribute.
* All other tables are ignored (processed as layout containers, not tabular data).
*
* You can assign `true` to this property to format all tables.
*/
tables?: string[] | boolean;
/**
* A dictionary with custom tag definitions.
*
* Use this to (re)define how to handle new or already supported tags.
*
* Empty string (`''`) as a key used for the default definition for "any other" tags.
*/
tags?: TagDefinitions;
/**
* All characters that are considered whitespace.
* Default is according to HTML specifications.
*/
whitespaceCharacters?: string;
/**
* After how many chars a line break should follow in `p` elements.
*
* Set to `null` or `false` to disable word-wrapping.
*/
wordwrap?: number | boolean | null;
/**
* The following are deprecated options. See the documentation.
*/
/**
* @deprecated See the documentation.
* By default links are translated the following
* <a href='link'>text</a> => becomes => text [link].
* If this option is set to true and link and text are the same,
@@ -39,6 +109,7 @@ export interface HtmlToTextOptions {
hideLinkHrefIfSameAsText?: boolean;
/**
* @deprecated See the documentation.
* Allows you to specify the server host for href attributes, where the links start at the root (/).
* For example, linkHrefBaseUrl = 'http://asdf.com' and <a href='/dir/subdir'>...</a>
* the link in the text will be http://asdf.com/dir/subdir.
@@ -47,112 +118,291 @@ export interface HtmlToTextOptions {
linkHrefBaseUrl?: string;
/**
* @deprecated See the documentation.
* Ignore all document links if true.
*/
ignoreHref?: boolean;
/**
* @deprecated See the documentation.
* Ignore all document images if true.
*/
ignoreImage?: boolean;
/**
* @deprecated See the documentation.
* Dont print brackets around the link if true
*/
noLinkBrackets?: boolean;
/**
* By default, any newlines \n in a block of text will be removed.
* If true, these newlines will not be removed.
*/
preserveNewlines?: boolean;
/**
* @deprecated See the documentation.
* By default, headings (<h1>, <h2>, etc) are upper-cased.
* Set to false to leave headings as they are.
*/
uppercaseHeadings?: boolean;
/**
* @deprecated See the documentation.
* By default, paragraphs are converted with two newlines (\n\n).
* Set to true to convert to a single newline.
*/
singleNewLineParagraphs?: boolean;
/**
* defines the tags whose text content will be captured from the html.
* All content will be captured below the baseElement tags and added to the
* resulting text output. This option allows the user to specify an array
* of elements as base elements using a single tag with css class and id
* parameters e.g. `[p.class1.class2#id1#id2, p.class1.class2#id1#id2]`.
* Default: `"body"`
*/
baseElement?: string | string[];
/**
* convert the entire document if we don't find the tag we're looking for
* if true
*/
returnDomByDefault?: boolean;
/**
* defines the text decoding options given to `he.decode`
* For more information see the [he](https://github.com/mathiasbynens/he) module
*/
decodeOptions?: {
isAttributeValue: boolean;
strict: boolean;
};
/**
* describes how to wrap long words
*/
longWordSplit?: {
/**
* an array containing the characters that may be wrapped on.
* These are used in order.
*/
wrapCharacters: string[];
/**
* defines whether to break long words on the limit if true.
*/
forceWrapOnLimit: boolean;
};
/**
* Customize the formatting of individual element types.
*/
format?: Formatters;
/**
* @deprecated See the documentation.
* defines the string that is used as item prefix for unordered lists `<ul>`.
* Default: ' * '
*/
unorderedListItemPrefix?: string;
}
export interface Formatters {
text?: LeafFormatter;
image?: LeafFormatter;
lineBreak?: Formatter;
paragraph?: Formatter;
anchor?: Formatter;
blockquote?: Formatter;
heading?: Formatter;
table?: Formatter;
orderedList?: Formatter;
unorderedList?: Formatter;
listItem?: Formatter;
horizontalLine?: Formatter;
/**
* Text decoding options given to `he.decode`.
*
* For more information see the [he](https://github.com/mathiasbynens/he) module.
*/
export interface DecodeOptions {
/**
* TLDR: If set to `true` - leave attribute values raw, don't parse them as text content.
*/
isAttributeValue?: boolean;
/**
* TLDR: If set to `true` - throw an error on invalid HTML input.
*/
strict?: boolean;
}
export type LeafFormatter<T = any> = (
el: T,
options: HtmlToTextOptions
) => string;
/**
* Options for handling complex documents and limiting the output size.
*/
export interface LimitsOptions {
/**
* A string to put in place of skipped content.
*/
ellipsis?: string;
/**
* Process only this many child nodes of any element.
*
* Remaining nodes, if any, will be replaced with ellipsis.
*
* Text nodes are counted along with tags.
*
* No limit if undefined.
*/
maxChildNodes?: number;
/**
* Only go to a certain depth starting from `HtmlToTextOptions.baseElement`.
*
* Replace deeper nodes with ellipsis.
*
* No depth limit if undefined.
*/
maxDepth?: number;
/**
* If the input string is longer than this value - it will be truncated
* and a message will be sent to `stderr`.
*
* Ellipsis is not used in this case.
*/
maxInputLength?: number;
}
export type Formatter<T = any> = (
el: T,
walk: (dom: any[], options: HtmlToTextOptions) => string,
options: HtmlToTextOptions
) => string;
/**
* Describes how to wrap long words.
*/
export interface LongWordSplitOptions {
/**
* Break long words on the `HtmlToTextOptions.wordwrap` limit when there are no characters to wrap on.
*/
forceWrapOnLimit?: boolean;
/**
* An array containing the characters that may be wrapped on.
*/
wrapCharacters?: string[];
}
/**
* Describes how to handle a tag.
*/
export interface TagDefinition {
/**
* Identifier of a {@link FormatCallback}, built-in or provided in the
* `HtmlToTextOptions.formatters` dictionary.
*/
format?: string;
/**
* Options to customize the formatter for this tag.
*/
options?: FormatOptions;
}
/**
* Options specific to different formatters ({@link FormatCallback}).
* This is an umbrella type definition. Each formatter supports its own subset of options.
*/
export interface FormatOptions {
/**
* Number of line breaks to separate previous block from this one.
*
* Note that N+1 line breaks are needed to make N empty lines.
*/
leadingLineBreaks?: number;
/**
* Number of line breaks to separate this block from the next one.
*
* Note that N+1 line breaks are needed to make N empty lines.
*/
trailingLineBreaks?: number;
/**
* (Only for: `anchor` and `image` formatters.) Server host for link `href` attributes and image `src` attributes
* relative to the root (the ones that start with `/`).
*
* For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>`
* the link in the text will be `http://asdf.com/dir/subdir`.
*
* Keep in mind that `baseUrl` should not end with a `/`.
*/
baseUrl?: string;
/**
* (Only for: `anchor` formatter.) By default links are translated in the following way:
*
* `<a href='link'>text</a>` => becomes => `text [link]`.
*
* If this option is set to `true` and `link` and `text` are the same,
* `[link]` will be omitted and only `text` will be present.
*/
hideLinkHrefIfSameAsText?: boolean;
/**
* (Only for: `anchor` formatter.) Ignore all links. Only process internal text of anchor tags.
*/
ignoreHref?: boolean;
/**
* (Only for: `anchor` formatter.) Ignore anchor links (where `href='#...'`).
*/
noAnchorUrl?: boolean;
/**
* (Only for: `anchor` formatter.) Don't print brackets around links.
*/
noLinkBrackets?: boolean;
/**
* (Only for: `unorderedList` formatter.) String prefix for each list item.
*/
itemPrefix?: string;
/**
* (Only for: `heading` formatter.) By default, headings (`<h1>`, `<h2>`, etc) are uppercased.
*
* Set this to `false` to leave headings as they are.
*/
uppercase?: boolean;
/**
* (Only for: `horizontalLine` formatter.) Length of the `<hr/>` line.
*
* If numeric value is provided - it is used.
* Otherwise, if global `wordwrap` number is provided - it is used.
* If neither is true, then the fallback value of 40 is used.
*/
length?: number;
/**
* (Only for: `blockquote` formatter.) Trim empty lines from blockquote.
*/
trimEmptyLines?: boolean;
/**
* (Only for: `table`, `dataTable` formatter.) By default, heading cells (`<th>`) are uppercased.
*
* Set this to `false` to leave heading cells as they are.
*/
uppercaseHeaderCells?: boolean;
/**
* (Only for: `table`, `dataTable` formatter.) Data table cell content will be wrapped to fit this width
* instead of global `wordwrap` limit.
*
* Set to `undefined` in order to fall back to `wordwrap` limit.
*/
maxColumnWidth?: number;
/**
* (Only for: `table`, `dataTable` formatter.) Number of spaces between data table columns.
*/
colSpacing?: number;
/**
* (Only for: `table`, `dataTable` formatter.) Number of empty lines between data table rows.
*/
rowSpacing?: number;
}
/**
* Simplified definition of [htmlparser2](https://github.com/fb55/htmlparser2) Node type.
*
* Makes no distinction between elements (tags) and data nodes (good enough for now).
*/
export interface DomNode {
/**
* Type of node - "text", "tag", "comment", "script", etc.
*/
type: string;
/**
* Content of a data node.
*/
data?: string;
/**
* Tag name.
*/
name?: string;
/**
* Tag attributes dictionary.
*/
attribs?: any;
/**
* Child nodes.
* Declared as required (not optional) for TypeScript use.
*/
children: DomNode[];
/**
* Parent node.
*/
parent?: DomNode;
}
/**
* A function to stringify a DOM node.
*
* The callback returns nothing (`void`).
*
* @param elem The DOM node to format.
* @param walk Callback to process child nodes (see {@link RecursiveCallback}).
* @param builder The {@link BlockTextBuilder} handed to the formatter.
* @param formatOptions Options for this formatter (see {@link FormatOptions}).
*/
export type FormatCallback = (elem: DomNode, walk: RecursiveCallback,
builder: BlockTextBuilder, formatOptions: FormatOptions) => void;
/**
* A function to process child nodes.
* Passed into a {@link FormatCallback} as an argument.
*
* @param nodes The child nodes to process.
* @param builder The {@link BlockTextBuilder} to use while processing them.
*/
export type RecursiveCallback = (nodes: DomNode[], builder: BlockTextBuilder) => void;
/**
* Type of object passed to tags in the options.
*
* Maps a tag name to the {@link TagDefinition} describing how to handle it.
* The empty string key (`''`) holds the default definition for "any other" tags.
*/
export interface TagDefinitions {
''?: TagDefinition;
a?: TagDefinition;
article?: TagDefinition;
aside?: TagDefinition;
blockquote?: TagDefinition;
br?: TagDefinition;
div?: TagDefinition;
footer?: TagDefinition;
form?: TagDefinition;
h1?: TagDefinition;
h2?: TagDefinition;
h3?: TagDefinition;
h4?: TagDefinition;
h5?: TagDefinition;
h6?: TagDefinition;
header?: TagDefinition;
hr?: TagDefinition;
img?: TagDefinition;
main?: TagDefinition;
nav?: TagDefinition;
ol?: TagDefinition;
p?: TagDefinition;
pre?: TagDefinition;
table?: TagDefinition;
ul?: TagDefinition;
wbr?: TagDefinition;
}

View File

@@ -0,0 +1,70 @@
/**
* Helps to build text from inline and block elements.
*/
export interface BlockTextBuilder {
/**
* Put a word-by-word transform function onto the transformations stack.
*
* Mainly used for uppercasing. Can be bypassed to add unformatted text such as URLs.
*
* Word transformations applied before wrapping.
*/
pushWordTransform(wordTransform: (str: string) => string): void;
/**
* Remove a function from the word transformations stack.
*/
popWordTransform(): ((str: string) => string) | undefined;
/**
* Add a line break into currently built block.
*/
addLineBreak(): void;
/**
* Allow to break line in case directly following text will not fit.
*/
addWordBreakOpportunity(): void;
/**
* Add a node inline into the currently built block.
*/
addInline(str: string, noWordTransform?: boolean): void;
/**
* Start building a new block.
*/
openBlock(leadingLineBreaks?: number, reservedLineLength?: number, isPre?: boolean): void;
/**
* Finalize currently built block, add its content to the parent block.
*
* @param blockTransform A function to transform the block text before adding to the parent block.
* This happens after word wrap and should be used in combination with reserved line length
* in order to keep line lengths correct.
* Used for whole block markup.
*/
closeBlock(trailingLineBreaks?: number, blockTransform?: (str: string) => string): void;
/**
* Start building a table.
*/
openTable(): void;
/**
* Start building a table row.
*/
openTableRow(): void;
/**
* Start building a table cell.
*/
openTableCell(maxColumnWidth?: number): void;
/**
* Finalize currently built table cell and add it to parent table row's cells.
*/
closeTableCell(colspan?: number, rowspan?: number): void;
/**
* Finalize currently built table row and add it to parent table's rows.
*/
closeTableRow(): void;
/**
* Finalize currently built table and add the rendered text to the parent block.
*/
closeTable(colSpacing?: number, rowSpacing?: number, leadingLineBreaks?: number, trailingLineBreaks?: number): void;
/**
* Return the rendered text content of this builder.
*/
toString(): string;
}

View File

@@ -1,14 +1,21 @@
import { Formatter, LeafFormatter } from '..';
import { FormatCallback } from '..';
export const text: LeafFormatter;
export const image: LeafFormatter;
export const lineBreak: Formatter;
export const paragraph: Formatter;
export const anchor: Formatter;
export const blockquote: Formatter;
export const heading: Formatter;
export const table: Formatter;
export const orderedList: Formatter;
export const unorderedList: Formatter;
export const listItem: Formatter;
export const horizontalLine: Formatter;
/**
* Formatter callbacks exported by this module.
* Every export shares the `FormatCallback` signature.
*/
export const anchor: FormatCallback;
export const block: FormatCallback;
export const blockquote: FormatCallback;
export const dataTable: FormatCallback;
export const heading: FormatCallback;
export const horizontalLine: FormatCallback;
export const image: FormatCallback;
export const inline: FormatCallback;
export const lineBreak: FormatCallback;
export const orderedList: FormatCallback;
export const paragraph: FormatCallback;
export const pre: FormatCallback;
export const skip: FormatCallback;
export const table: FormatCallback;
export const unorderedList: FormatCallback;
export const wbr: FormatCallback;