diff --git a/src/core/mentions/__tests__/fetchUrlContent.spec.ts b/src/core/mentions/__tests__/fetchUrlContent.spec.ts
new file mode 100644
index 0000000000..571135a386
--- /dev/null
+++ b/src/core/mentions/__tests__/fetchUrlContent.spec.ts
@@ -0,0 +1,118 @@
+// npx vitest core/mentions/__tests__/fetchUrlContent.spec.ts
+
+import axios from "axios"
+
+import { fetchUrlContent } from "../fetchUrlContent"
+
+vi.mock("axios")
+
+describe("fetchUrlContent", () => { // axios is auto-mocked via vi.mock("axios"), so no test performs real network I/O
+	beforeEach(() => {
+		vi.clearAllMocks() // reset recorded calls between tests
+	})
+
+	it("should fetch and extract text from HTML content", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "text/html; charset=utf-8" },
+			data: `
+				<html>
+					<head><title>Test Page</title></head>
+					<body>
+						<script>var x = "ignore me"</script>
+						<style>.hidden { display: none; }</style>
+						<nav>Navigation links</nav>
+						<main>
+							<h1>Hello World</h1>
+							<p>This is the main content of the page.</p>
+						</main>
+						<footer>Footer content</footer>
+					</body>
+				</html>
+			`,
+		})
+
+		const result = await fetchUrlContent("https://example.com")
+
+		expect(result.url).toBe("https://example.com")
+		expect(result.content).toContain("Hello World")
+		expect(result.content).toContain("This is the main content of the page.")
+		// Text that lives only inside <script>, <nav> and <footer> must not leak into the result
+		expect(result.content).not.toContain("ignore me")
+		expect(result.content).not.toContain("Navigation links")
+		expect(result.content).not.toContain("Footer content")
+		expect(result.truncated).toBe(false)
+	})
+
+	it("should return raw text for non-HTML content", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "text/plain" },
+			data: "Plain text content from the URL",
+		})
+
+		const result = await fetchUrlContent("https://example.com/file.txt")
+
+		expect(result.content).toBe("Plain text content from the URL")
+		expect(result.truncated).toBe(false)
+	})
+
+	it("should handle JSON content type as raw text", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "application/json" },
+			data: '{"key": "value"}',
+		})
+
+		const result = await fetchUrlContent("https://example.com/api/data")
+
+		expect(result.content).toBe('{"key": "value"}')
+	})
+
+	it("should truncate content that exceeds the max length", async () => {
+		const longContent = "x".repeat(60_000) // longer than the 50_000-character cap asserted below
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "text/plain" },
+			data: longContent,
+		})
+
+		const result = await fetchUrlContent("https://example.com/large")
+
+		expect(result.truncated).toBe(true)
+		expect(result.content.length).toBe(50_000)
+	})
+
+	it("should propagate axios errors", async () => {
+		vi.mocked(axios.get).mockRejectedValueOnce(new Error("Request failed with status code 404"))
+
+		await expect(fetchUrlContent("https://example.com/not-found")).rejects.toThrow(
+			"Request failed with status code 404",
+		)
+	})
+
+	it("should use body as fallback when no main/article element exists", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: { "content-type": "text/html" },
+			data: `
+				<html>
+					<body>
+						<div>Some body content without semantic elements</div>
+					</body>
+				</html>
+			`,
+		})
+
+		const result = await fetchUrlContent("https://example.com/simple")
+
+		expect(result.content).toContain("Some body content without semantic elements")
+	})
+
+	it("should handle missing content-type header", async () => {
+		vi.mocked(axios.get).mockResolvedValueOnce({
+			headers: {},
+			data: "Some raw content",
+		})
+
+		const result = await fetchUrlContent("https://example.com/unknown")
+
+		// With no content-type, it falls through to the non-HTML path
+		expect(result.content).toBe("Some raw content")
+	})
+})
diff --git a/src/core/mentions/__tests__/index.spec.ts b/src/core/mentions/__tests__/index.spec.ts
index fa96a396dc..4f40cc5ab0 100644
--- a/src/core/mentions/__tests__/index.spec.ts
+++ b/src/core/mentions/__tests__/index.spec.ts
@@ -16,15 +16,57 @@ vi.mock("../../../i18n", () => ({
t: vi.fn((key: string) => key),
}))
+// Mock fetchUrlContent so these tests never hit the network; by default it resolves to a short, untruncated page
+vi.mock("../fetchUrlContent", () => ({
+ fetchUrlContent: vi.fn().mockResolvedValue({
+ url: "https://example.com",
+ content: "Example page content here",
+ truncated: false,
+ }),
+}))
+
describe("parseMentions - URL mention handling", () => {
beforeEach(() => {
vi.clearAllMocks()
})
- it("should replace URL mentions with quoted URL reference", async () => {
+ it("should replace URL mentions with quoted URL reference indicating content", async () => {
+ const result = await parseMentions("Check @https://example.com", "/test")
+
+ expect(result.text).toContain("'https://example.com' (see below for fetched content)") // inline text that replaces the @mention
+ })
+
+ it("should produce a content block with fetched URL content", async () => {
+ const result = await parseMentions("Check @https://example.com", "/test")
+
+ expect(result.contentBlocks).toHaveLength(1)
+ expect(result.contentBlocks[0].type).toBe("url")
+ expect(result.contentBlocks[0].content).toContain("Example page content here") // body comes from the mocked fetchUrlContent
+ expect(result.contentBlocks[0].content).toContain("[url_content for 'https://example.com']") // header line placed before the fetched text
+ })
+
+ it("should handle URL fetch errors gracefully", async () => {
+ const { fetchUrlContent } = await import("../fetchUrlContent")
+ vi.mocked(fetchUrlContent).mockRejectedValueOnce(new Error("Network timeout")) // overrides the default mock for the next call only
+
+ const result = await parseMentions("Check @https://example.com", "/test")
+
+ expect(result.contentBlocks).toHaveLength(1)
+ expect(result.contentBlocks[0].type).toBe("url")
+ expect(result.contentBlocks[0].content).toContain("Error fetching URL content: Network timeout") // failure is reported inside the block, not thrown
+ })
+
+ it("should indicate truncation when content is truncated", async () => {
+ const { fetchUrlContent } = await import("../fetchUrlContent")
+ vi.mocked(fetchUrlContent).mockResolvedValueOnce({
+ url: "https://example.com",
+ content: "Truncated content...",
+ truncated: true,
+ })
+
const result = await parseMentions("Check @https://example.com", "/test")
- // URL mentions are now replaced with a quoted reference (no fetching)
- expect(result.text).toContain("'https://example.com'")
+ expect(result.contentBlocks).toHaveLength(1)
+ expect(result.contentBlocks[0].content).toContain("[Content truncated due to length]") // note appended when the fetch result has truncated: true
})
})
diff --git a/src/core/mentions/fetchUrlContent.ts b/src/core/mentions/fetchUrlContent.ts
new file mode 100644
index 0000000000..6761bcddc7
--- /dev/null
+++ b/src/core/mentions/fetchUrlContent.ts
@@ -0,0 +1,77 @@
+import axios from "axios"
+import * as cheerio from "cheerio"
+
+const MAX_CONTENT_LENGTH = 50_000 // Upper bound on the returned `content`, measured with String.length
+const REQUEST_TIMEOUT_MS = 15_000 // Passed to axios as the request `timeout`
+
+export interface FetchUrlResult {
+ url: string // The URL that was requested, echoed back unchanged
+ content: string // Text extracted from HTML, or the raw body for any other content type
+ truncated: boolean // True when `content` was cut down to MAX_CONTENT_LENGTH
+}
+
+/**
+ * Requests `url` with axios and returns its readable text.
+ *
+ * HTML/XHTML responses are reduced to text via `extractTextFromHtml`; any other
+ * content type (plain text, JSON, missing header) is returned as received.
+ * The result is capped at `MAX_CONTENT_LENGTH` UTF-16 code units.
+ *
+ * @param url - Absolute URL to request.
+ * @returns The requested URL, the extracted text, and whether it was truncated.
+ * @throws Whatever axios rejects with (e.g. timeout or an error status code).
+ */
+export async function fetchUrlContent(url: string): Promise<FetchUrlResult> {
+	const response = await axios.get(url, {
+		timeout: REQUEST_TIMEOUT_MS,
+		maxRedirects: 5,
+		responseType: "text",
+		headers: {
+			"User-Agent": "Roo-Code/1.0 (URL Context Fetcher)",
+			Accept: "text/html, application/xhtml+xml, text/plain, */*",
+		},
+		maxContentLength: 5 * 1024 * 1024, // 5MB limit to avoid downloading huge files
+	})
+	// Media types are case-insensitive, so normalise before matching.
+	const contentType = String(response.headers["content-type"] ?? "").toLowerCase()
+	const data: unknown = response.data
+	// Serialise a non-string body as JSON instead of letting it become "[object Object]".
+	const rawBody = typeof data === "string" ? data : data == null ? "" : (JSON.stringify(data) ?? String(data))
+	const isHtml = contentType.includes("text/html") || contentType.includes("application/xhtml")
+	let text = isHtml ? extractTextFromHtml(rawBody) : rawBody
+	const truncated = text.length > MAX_CONTENT_LENGTH
+	if (truncated) {
+		// Step back one code unit when the cut would otherwise split a surrogate pair.
+		const last = text.charCodeAt(MAX_CONTENT_LENGTH - 1)
+		text = text.slice(0, last >= 0xd800 && last <= 0xdbff ? MAX_CONTENT_LENGTH - 1 : MAX_CONTENT_LENGTH)
+	}
+	return { url, content: text, truncated }
+}
+
+/**
+ * Reduces an HTML document to its readable text using cheerio.
+ *
+ * Scripts, styles, navigation and similar page chrome are removed first. Text is then
+ * taken from the outermost main-content containers, or from <body> when none match.
+ * @param html - Raw HTML markup.
+ * @returns Trimmed text with runs of spaces and of blank lines collapsed.
+ */
+function extractTextFromHtml(html: string): string {
+	const $ = cheerio.load(html)
+	const contentSelector = "main, article, [role='main'], .content, #content, .post, .article"
+	$(
+		"script, style, nav, footer, header, noscript, svg, iframe, form, button, [role='navigation'], [role='banner'], [role='contentinfo'], [aria-hidden='true']",
+	).remove()
+	// Keep only outermost matches: `.text()` concatenates every matched element, so a
+	// container nested inside another match (e.g. <article> in <main>) would appear twice.
+	let contentEl = $(contentSelector).filter((_, el) => $(el).parents(contentSelector).length === 0)
+	if (contentEl.length === 0) {
+		contentEl = $("body")
+	}
+	return contentEl
+		.text()
+		.replace(/[ \t]+/g, " ") // Collapse horizontal whitespace
+		.replace(/ ?\r?\n ?/g, "\n") // Drop padding around line breaks so blank-line runs become contiguous
+		.replace(/\n{3,}/g, "\n\n") // Collapse excessive newlines
+		.trim()
+}
diff --git a/src/core/mentions/index.ts b/src/core/mentions/index.ts
index 1bfb90d23f..f08f1476b2 100644
--- a/src/core/mentions/index.ts
+++ b/src/core/mentions/index.ts
@@ -19,6 +19,7 @@ import { RooIgnoreController } from "../ignore/RooIgnoreController"
import { getCommand, type Command } from "../../services/command/commands"
import { buildSkillResult, resolveSkillContentForMode, type SkillLookup } from "../../services/skills/skillInvocation"
import type { SkillContent } from "../../shared/skills"
+import { fetchUrlContent } from "./fetchUrlContent"
export async function openMention(cwd: string, mention?: string): Promise {
if (!mention) {
@@ -163,7 +164,7 @@ export async function parseMentions(
parsedText = parsedText.replace(mentionRegexGlobal, (match, mention) => {
mentions.add(mention)
if (mention.startsWith("http")) {
- return `'${mention}'`
+ return `'${mention}' (see below for fetched content)`
} else if (mention.startsWith("/")) {
// Clean path reference - no "see below" since we format like tool results
const mentionPath = mention.slice(1)
@@ -221,6 +222,21 @@ export async function parseMentions(
} catch (error) {
parsedText += `\n\n\nError fetching commit info: ${error.message}\n`
}
+ } else if (mention.startsWith("http")) {
+ try {
+ const result = await fetchUrlContent(mention)
+ const truncationNote = result.truncated ? "\n[Content truncated due to length]" : ""
+ contentBlocks.push({
+ type: "url",
+ content: `[url_content for '${mention}']\n${result.content}${truncationNote}`,
+ })
+ } catch (error) {
+ const errorMsg = error instanceof Error ? error.message : String(error)
+ contentBlocks.push({
+ type: "url",
+ content: `[url_content for '${mention}']\nError fetching URL content: ${errorMsg}`,
+ })
+ }
} else if (mention === "terminal") {
try {
const terminalOutput = await getLatestTerminalOutput()