fix: use htmlparser2 instead of HTMLRewriter for node compat (#26309)

2026-05-13 23:52:06 +00:00 · 2026-05-13 11:28:08 +08:00
parent 46daede10c
commit ad6a8a1850
4 changed files with 41 additions and 28 deletions
--- a/bun.lock
+++ b/bun.lock
@@ -430,6 +430,7 @@
        "glob": "13.0.5",
        "google-auth-library": "10.5.0",
        "gray-matter": "4.0.3",
+        "htmlparser2": "8.0.2",
        "ignore": "7.0.5",
        "immer": "11.1.4",
        "jsonc-parser": "3.3.1",
--- a/packages/opencode/package.json
+++ b/packages/opencode/package.json
@@ -141,6 +141,7 @@
    "glob": "13.0.5",
    "google-auth-library": "10.5.0",
    "gray-matter": "4.0.3",
+    "htmlparser2": "8.0.2",
    "ignore": "7.0.5",
    "immer": "11.1.4",
    "jsonc-parser": "3.3.1",
--- a/packages/opencode/src/tool/webfetch.ts
+++ b/packages/opencode/src/tool/webfetch.ts
@@ -1,5 +1,6 @@
 import { Effect, Schema } from "effect"
 import { HttpClient, HttpClientRequest } from "effect/unstable/http"
+import { Parser } from "htmlparser2"
 import * as Tool from "./tool"
 import TurndownService from "turndown"
 import DESCRIPTION from "./webfetch.txt"
@@ -139,8 +140,7 @@ export const WebFetchTool = Tool.define(

            case "text":
              if (contentType.includes("text/html")) {
-                const text = yield* Effect.promise(() => extractTextFromHTML(content))
-                return { output: text, title, metadata: {} }
+                return { output: extractTextFromHTML(content), title, metadata: {} }
              }
              return { output: content, title, metadata: {} }

@@ -155,35 +155,27 @@ export const WebFetchTool = Tool.define(
  }),
 )

-async function extractTextFromHTML(html: string) {
+function extractTextFromHTML(html: string) {
  let text = ""
-  let skipContent = false
+  let skipDepth = 0

-  const rewriter = new HTMLRewriter()
-    .on("script, style, noscript, iframe, object, embed", {
-      element() {
-        skipContent = true
-      },
-      text() {
-        // Skip text content inside these elements
-      },
-    })
-    .on("*", {
-      element(element) {
-        // Reset skip flag when entering other elements
-        if (!["script", "style", "noscript", "iframe", "object", "embed"].includes(element.tagName)) {
-          skipContent = false
-        }
-      },
-      text(input) {
-        if (!skipContent) {
-          text += input.text
-        }
-      },
-    })
-    .transform(new Response(html))
+  const parser = new Parser({
+    onopentag(name) {
+      if (skipDepth > 0 || ["script", "style", "noscript", "iframe", "object", "embed"].includes(name)) {
+        skipDepth++
+      }
+    },
+    ontext(input) {
+      if (skipDepth === 0) text += input
+    },
+    onclosetag() {
+      if (skipDepth > 0) skipDepth--
+    },
+  })
+
+  parser.write(html)
+  parser.end()

-  await rewriter.text()
  return text.trim()
 }

--- a/packages/opencode/test/tool/webfetch.test.ts
+++ b/packages/opencode/test/tool/webfetch.test.ts
@@ -91,4 +91,23 @@ describe("tool.webfetch", () => {
        }),
    ),
  )
+
+  it.instance("extracts text from html without scripts or styles", () =>
+    withFetch(
+      () =>
+        new Response(
+          "<html><head><style>.hidden{}</style><script>alert('x')</script></head><body>Hello <b>world</b></body></html>",
+          {
+            status: 200,
+            headers: { "content-type": "text/html; charset=utf-8" },
+          },
+        ),
+      (url) =>
+        Effect.gen(function* () {
+          const result = yield* exec({ url: new URL("/page.html", url).toString(), format: "text" })
+          expect(result.output).toBe("Hello world")
+          expect(result.attachments).toBeUndefined()
+        }),
+    ),
+  )
 })