mirror of
https://github.com/anomalyco/opencode.git
synced 2026-05-13 23:52:06 +00:00
fix: use htmlparser2 instead of HTMLRewriter for node compat (#26309)
This commit is contained in:
1
bun.lock
1
bun.lock
@@ -430,6 +430,7 @@
|
||||
"glob": "13.0.5",
|
||||
"google-auth-library": "10.5.0",
|
||||
"gray-matter": "4.0.3",
|
||||
"htmlparser2": "8.0.2",
|
||||
"ignore": "7.0.5",
|
||||
"immer": "11.1.4",
|
||||
"jsonc-parser": "3.3.1",
|
||||
|
||||
@@ -141,6 +141,7 @@
|
||||
"glob": "13.0.5",
|
||||
"google-auth-library": "10.5.0",
|
||||
"gray-matter": "4.0.3",
|
||||
"htmlparser2": "8.0.2",
|
||||
"ignore": "7.0.5",
|
||||
"immer": "11.1.4",
|
||||
"jsonc-parser": "3.3.1",
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { Effect, Schema } from "effect"
|
||||
import { HttpClient, HttpClientRequest } from "effect/unstable/http"
|
||||
import { Parser } from "htmlparser2"
|
||||
import * as Tool from "./tool"
|
||||
import TurndownService from "turndown"
|
||||
import DESCRIPTION from "./webfetch.txt"
|
||||
@@ -139,8 +140,7 @@ export const WebFetchTool = Tool.define(
|
||||
|
||||
case "text":
|
||||
if (contentType.includes("text/html")) {
|
||||
const text = yield* Effect.promise(() => extractTextFromHTML(content))
|
||||
return { output: text, title, metadata: {} }
|
||||
return { output: extractTextFromHTML(content), title, metadata: {} }
|
||||
}
|
||||
return { output: content, title, metadata: {} }
|
||||
|
||||
@@ -155,35 +155,27 @@ export const WebFetchTool = Tool.define(
|
||||
}),
|
||||
)
|
||||
|
||||
async function extractTextFromHTML(html: string) {
|
||||
function extractTextFromHTML(html: string) {
|
||||
let text = ""
|
||||
let skipContent = false
|
||||
let skipDepth = 0
|
||||
|
||||
const rewriter = new HTMLRewriter()
|
||||
.on("script, style, noscript, iframe, object, embed", {
|
||||
element() {
|
||||
skipContent = true
|
||||
},
|
||||
text() {
|
||||
// Skip text content inside these elements
|
||||
},
|
||||
})
|
||||
.on("*", {
|
||||
element(element) {
|
||||
// Reset skip flag when entering other elements
|
||||
if (!["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)) {
|
||||
skipContent = false
|
||||
}
|
||||
},
|
||||
text(input) {
|
||||
if (!skipContent) {
|
||||
text += input.text
|
||||
}
|
||||
},
|
||||
})
|
||||
.transform(new Response(html))
|
||||
const parser = new Parser({
|
||||
onopentag(name) {
|
||||
if (skipDepth > 0 || ["script", "style", "noscript", "iframe", "object", "embed"].includes(name)) {
|
||||
skipDepth++
|
||||
}
|
||||
},
|
||||
ontext(input) {
|
||||
if (skipDepth === 0) text += input
|
||||
},
|
||||
onclosetag() {
|
||||
if (skipDepth > 0) skipDepth--
|
||||
},
|
||||
})
|
||||
|
||||
parser.write(html)
|
||||
parser.end()
|
||||
|
||||
await rewriter.text()
|
||||
return text.trim()
|
||||
}
|
||||
|
||||
|
||||
@@ -91,4 +91,23 @@ describe("tool.webfetch", () => {
|
||||
}),
|
||||
),
|
||||
)
|
||||
|
||||
it.instance("extracts text from html without scripts or styles", () =>
|
||||
withFetch(
|
||||
() =>
|
||||
new Response(
|
||||
"<html><head><style>.hidden{}</style><script>alert('x')</script></head><body>Hello <b>world</b></body></html>",
|
||||
{
|
||||
status: 200,
|
||||
headers: { "content-type": "text/html; charset=utf-8" },
|
||||
},
|
||||
),
|
||||
(url) =>
|
||||
Effect.gen(function* () {
|
||||
const result = yield* exec({ url: new URL("/page.html", url).toString(), format: "text" })
|
||||
expect(result.output).toBe("Hello world")
|
||||
expect(result.attachments).toBeUndefined()
|
||||
}),
|
||||
),
|
||||
)
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user