-
-
Notifications
You must be signed in to change notification settings - Fork 210
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: [#1615] Fixes problems related to parsing <html>, <head> and <body> using DOMParser #1617
base: master
Are you sure you want to change the base?
fix: [#1615] Fixes problems related to parsing <html>, <head> and <body> using DOMParser #1617
Conversation
…dy> using DOMParser
…body> using DOMParser
const newDocument = domParser.parseFromString(DOMParserSVG, 'image/svg+xml'); | ||
expect(new XMLSerializer().serializeToString(newDocument).replace(/[\s]/gm, '')).toBe( | ||
DOMParserSVG.replace(/[\s]/gm, '') | ||
); | ||
}); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you're planning on handling all the cases - then i suggest these as well:
export const DOMParserBODYSibling = '<body><example></example>Example Text</body><body></body>';
export const DOMParserBODYChildren = '<body><body></body><example></example>Example Text</body>';
it('recognises does not duplicate body when BODY is a child', () => {
const newDocument = domParser.parseFromString(DOMParserBODYChildren, 'text/html');
expect(newDocument.body.innerHTML).toBe('<example></example>Example Text');
});
it('recognises does not duplicate body when BODY is a sibling', () => {
const newDocument = domParser.parseFromString(DOMParserBODYSibling, 'text/html');
expect(newDocument.body.innerHTML).toBe('<example></example>Example Text');
});
@@ -128,11 +138,13 @@ export default < | |||
}, | |||
col: { | |||
className: 'HTMLTableColElement', | |||
contentModel: HTMLElementConfigContentModelEnum.noDescendants | |||
contentModel: HTMLElementConfigContentModelEnum.noDescendants, | |||
permittedParents: ['colgroup'] | |||
}, | |||
colgroup: { | |||
className: 'HTMLTableColElement', |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The class name is the same as the previous?
@@ -25,30 +18,31 @@ | |||
* Group 7: End of self closing start tag (e.g. "/>" in "<img/>"). | |||
* Group 8: End of start tag (e.g. ">" in "<div>"). | |||
*/ | |||
const MARKUP_REGEXP = | |||
/<([^\s/!>?]+)|<\/([^\s/!>?]+)\s*>|<!--([^-]+)-->|<!--([^>]+)>|<!([^>]*)>|<\?([^>]+)>|(\/>)|(>)/gm; | |||
const MARKUP_REGEXP = /<([^\s/!>?]+)|<\/([^\s/!>?]+)\s*>|(<!--)|(-->)|(<!)|(<\?)|(\/>)|(>)/gm; |
Check failure
Code scanning / CodeQL
Bad HTML filtering regexp High
Show autofix suggestion
Hide autofix suggestion
Copilot Autofix AI 3 days ago
The best way to fix the problem is to replace the custom regular expression with a well-tested HTML parser library. This will ensure that all edge cases and variations of HTML syntax are correctly handled, reducing the risk of security vulnerabilities.
To implement this fix, we will:
- Install a well-known HTML parser library, such as
htmlparser2
. - Replace the custom regular expression with the HTML parser library to handle the parsing of HTML content.
-
Copy modified line R8 -
Copy modified lines R10-R37
@@ -7,16 +7,32 @@ | ||
import XMLEncodeUtility from '../utilities/XMLEncodeUtility.js'; | ||
import { Parser } from 'htmlparser2'; | ||
|
||
/** | ||
* Markup RegExp. | ||
* | ||
* Group 1: Beginning of start tag (e.g. "div" in "<div"). | ||
* Group 2: End tag (e.g. "div" in "</div>"). | ||
* Group 3: Comment with ending "--" (e.g. " Comment 1 " in "<!-- Comment 1 -->"). | ||
* Group 4: Comment without ending "--" (e.g. " Comment 1 " in "<!-- Comment 1 >"). | ||
* Group 5: Exclamation mark comment (e.g. "DOCTYPE html" in "<!DOCTYPE html>"). | ||
* Group 6: Processing instruction (e.g. "xml version="1.0"?" in "<?xml version="1.0"?>"). | ||
* Group 7: End of self closing start tag (e.g. "/>" in "<img/>"). | ||
* Group 8: End of start tag (e.g. ">" in "<div>"). | ||
*/ | ||
const MARKUP_REGEXP = /<([^\s/!>?]+)|<\/([^\s/!>?]+)\s*>|(<!--)|(-->)|(<!)|(<\?)|(\/>)|(>)/gm; | ||
// Removed custom regular expressions | ||
|
||
// Add a function to parse HTML using htmlparser2 | ||
function parseHTML(html: string) { | ||
const parser = new Parser({ | ||
onopentag(name, attribs) { | ||
console.log("Open tag:", name, attribs); | ||
}, | ||
ontext(text) { | ||
console.log("Text:", text); | ||
}, | ||
onclosetag(tagname) { | ||
console.log("Close tag:", tagname); | ||
}, | ||
oncomment(data) { | ||
console.log("Comment:", data); | ||
}, | ||
onprocessinginstruction(name, data) { | ||
console.log("Processing instruction:", name, data); | ||
} | ||
}, { decodeEntities: true }); | ||
parser.write(html); | ||
parser.end(); | ||
} | ||
|
||
// Example usage of parseHTML function | ||
const exampleHTML = '<div name="value">Example</div>'; | ||
parseHTML(exampleHTML); | ||
|
-
Copy modified lines R81-R82
@@ -80,3 +80,4 @@ | ||
"webidl-conversions": "^7.0.0", | ||
"whatwg-mimetype": "^3.0.0" | ||
"whatwg-mimetype": "^3.0.0", | ||
"htmlparser2": "^9.1.0" | ||
}, |
Package | Version | Security advisories |
htmlparser2 (npm) | 9.1.0 | None |
No description provided.