fix: use htmlparser2 instead of HTMLRewriter for node compat (#26309)

This commit is contained in:
Brendan Allan
2026-05-13 11:28:08 +08:00
committed by GitHub
parent 46daede10c
commit ad6a8a1850
4 changed files with 41 additions and 28 deletions

View File

@@ -430,6 +430,7 @@
"glob": "13.0.5",
"google-auth-library": "10.5.0",
"gray-matter": "4.0.3",
"htmlparser2": "8.0.2",
"ignore": "7.0.5",
"immer": "11.1.4",
"jsonc-parser": "3.3.1",

View File

@@ -141,6 +141,7 @@
"glob": "13.0.5",
"google-auth-library": "10.5.0",
"gray-matter": "4.0.3",
"htmlparser2": "8.0.2",
"ignore": "7.0.5",
"immer": "11.1.4",
"jsonc-parser": "3.3.1",

View File

@@ -1,5 +1,6 @@
import { Effect, Schema } from "effect"
import { HttpClient, HttpClientRequest } from "effect/unstable/http"
import { Parser } from "htmlparser2"
import * as Tool from "./tool"
import TurndownService from "turndown"
import DESCRIPTION from "./webfetch.txt"
@@ -139,8 +140,7 @@ export const WebFetchTool = Tool.define(
case "text":
if (contentType.includes("text/html")) {
const text = yield* Effect.promise(() => extractTextFromHTML(content))
return { output: text, title, metadata: {} }
return { output: extractTextFromHTML(content), title, metadata: {} }
}
return { output: content, title, metadata: {} }
@@ -155,35 +155,27 @@ export const WebFetchTool = Tool.define(
}),
)
// NOTE(review): this hunk shows the previous and the new version of
// extractTextFromHTML interleaved without +/- markers. Judging by the commit
// title, the HTMLRewriter-based lines are presumably the removed side and the
// htmlparser2 `Parser`-based lines the added side — confirm against the raw diff.
//
// Purpose (both versions): take an HTML string and return its text content,
// trimmed, leaving out text that belongs to script, style, noscript, iframe,
// object and embed elements.

// Previous signature: async, because the result was only complete after
// awaiting `rewriter.text()` further down.
async function extractTextFromHTML(html: string) {
// New signature: synchronous; the parser is driven with write()/end() below.
function extractTextFromHTML(html: string) {
let text = ""
// Previous state: a single boolean flag toggled by the element handlers.
let skipContent = false
// New state: a counter that is incremented on open tags and decremented on
// close tags while inside a skipped element.
let skipDepth = 0
// Previous implementation: two HTMLRewriter handler sets, one for the skipped
// selectors and one for every element ("*").
const rewriter = new HTMLRewriter()
.on("script, style, noscript, iframe, object, embed", {
element() {
skipContent = true
},
text() {
// Skip text content inside these elements
},
})
.on("*", {
element(element) {
// Reset skip flag when entering other elements
if (!["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)) {
skipContent = false
}
},
text(input) {
if (!skipContent) {
text += input.text
}
},
})
.transform(new Response(html))
// New implementation: htmlparser2 `Parser` with three callbacks.
const parser = new Parser({
onopentag(name) {
// Start skipping when a tag from the skip list opens; while already
// skipping, count every further open tag as well.
if (skipDepth > 0 || ["script", "style", "noscript", "iframe", "object", "embed"].includes(name)) {
skipDepth++
}
},
ontext(input) {
// Collect text only while no skipped element is open.
if (skipDepth === 0) text += input
},
onclosetag() {
// Undo one level of skipping per close tag; never goes below zero.
if (skipDepth > 0) skipDepth--
},
})
// Feed the whole document in one chunk, then signal end of input.
parser.write(html)
parser.end()
// Previous implementation only: awaits the transformed response body; the
// collected `text` is what gets returned, not this value.
await rewriter.text()
return text.trim()
}

View File

@@ -91,4 +91,23 @@ describe("tool.webfetch", () => {
}),
),
)
// Serves an HTML page whose <head> contains a <style> and a <script> element,
// fetches it with format "text", and expects only the body text ("Hello world")
// in the output, with no attachments on the result.
it.instance("extracts text from html without scripts or styles", () =>
withFetch(
() =>
new Response(
"<html><head><style>.hidden{}</style><script>alert('x')</script></head><body>Hello <b>world</b></body></html>",
{
status: 200,
headers: { "content-type": "text/html; charset=utf-8" },
},
),
(url) =>
Effect.gen(function* () {
const result = yield* exec({ url: new URL("/page.html", url).toString(), format: "text" })
expect(result.output).toBe("Hello world")
expect(result.attachments).toBeUndefined()
}),
),
)
})