diff --git a/src/core/mentions/__tests__/fetchUrlContent.spec.ts b/src/core/mentions/__tests__/fetchUrlContent.spec.ts new file mode 100644 index 0000000000..571135a386 --- /dev/null +++ b/src/core/mentions/__tests__/fetchUrlContent.spec.ts @@ -0,0 +1,118 @@ +// npx vitest core/mentions/__tests__/fetchUrlContent.spec.ts + +import axios from "axios" + +import { fetchUrlContent } from "../fetchUrlContent" + +// Replace the axios module with a mock; every test below stubs axios.get itself. +vi.mock("axios") + +describe("fetchUrlContent", () => { + beforeEach(() => { + // Drop call history recorded by the previous test. + vi.clearAllMocks() + }) + + // A text/html response should be reduced to its readable text. + it("should fetch and extract text from HTML content", async () => { + vi.mocked(axios.get).mockResolvedValueOnce({ + headers: { "content-type": "text/html; charset=utf-8" }, + data: ` + + Test Page + + + + +
+

Hello World

+

This is the main content of the page.

+
+ + + + `, + }) + + const result = await fetchUrlContent("https://example.com") + + expect(result.url).toBe("https://example.com") + expect(result.content).toContain("Hello World") + expect(result.content).toContain("This is the main content of the page.") + // Script/style/nav/footer should be removed + expect(result.content).not.toContain("ignore me") + expect(result.content).not.toContain("Navigation links") + expect(result.content).not.toContain("Footer content") + expect(result.truncated).toBe(false) + }) + + it("should return raw text for non-HTML content", async () => { + vi.mocked(axios.get).mockResolvedValueOnce({ + headers: { "content-type": "text/plain" }, + data: "Plain text content from the URL", + }) + + const result = await fetchUrlContent("https://example.com/file.txt") + + expect(result.content).toBe("Plain text content from the URL") + expect(result.truncated).toBe(false) + }) + + it("should handle JSON content type as raw text", async () => { + vi.mocked(axios.get).mockResolvedValueOnce({ + headers: { "content-type": "application/json" }, + data: '{"key": "value"}', + }) + + const result = await fetchUrlContent("https://example.com/api/data") + + expect(result.content).toBe('{"key": "value"}') + }) + + it("should truncate content that exceeds the max length", async () => { + const longContent = "x".repeat(60_000) + vi.mocked(axios.get).mockResolvedValueOnce({ + headers: { "content-type": "text/plain" }, + data: longContent, + }) + + const result = await fetchUrlContent("https://example.com/large") + + expect(result.truncated).toBe(true) + expect(result.content.length).toBe(50_000) + }) + + it("should propagate axios errors", async () => { + vi.mocked(axios.get).mockRejectedValueOnce(new Error("Request failed with status code 404")) + + await expect(fetchUrlContent("https://example.com/not-found")).rejects.toThrow( + "Request failed with status code 404", + ) + }) + + it("should use body as fallback when no main/article element exists", async () 
=> { + vi.mocked(axios.get).mockResolvedValueOnce({ + headers: { "content-type": "text/html" }, + data: ` + + +
Some body content without semantic elements
+ + + `, + }) + + const result = await fetchUrlContent("https://example.com/simple") + + expect(result.content).toContain("Some body content without semantic elements") + }) + + it("should handle missing content-type header", async () => { + vi.mocked(axios.get).mockResolvedValueOnce({ + headers: {}, + data: "Some raw content", + }) + + const result = await fetchUrlContent("https://example.com/unknown") + + // With no content-type, it falls through to the non-HTML path + expect(result.content).toBe("Some raw content") + }) +}) diff --git a/src/core/mentions/__tests__/index.spec.ts b/src/core/mentions/__tests__/index.spec.ts index fa96a396dc..4f40cc5ab0 100644 --- a/src/core/mentions/__tests__/index.spec.ts +++ b/src/core/mentions/__tests__/index.spec.ts @@ -16,15 +16,57 @@ vi.mock("../../../i18n", () => ({ t: vi.fn((key: string) => key), })) +// Mock fetchUrlContent +vi.mock("../fetchUrlContent", () => ({ + fetchUrlContent: vi.fn().mockResolvedValue({ + url: "https://example.com", + content: "Example page content here", + truncated: false, + }), +})) + describe("parseMentions - URL mention handling", () => { beforeEach(() => { vi.clearAllMocks() }) - it("should replace URL mentions with quoted URL reference", async () => { + it("should replace URL mentions with quoted URL reference indicating content", async () => { + const result = await parseMentions("Check @https://example.com", "/test") + + expect(result.text).toContain("'https://example.com' (see below for fetched content)") + }) + + it("should produce a content block with fetched URL content", async () => { + const result = await parseMentions("Check @https://example.com", "/test") + + expect(result.contentBlocks).toHaveLength(1) + expect(result.contentBlocks[0].type).toBe("url") + expect(result.contentBlocks[0].content).toContain("Example page content here") + expect(result.contentBlocks[0].content).toContain("[url_content for 'https://example.com']") + }) + + it("should handle URL fetch errors gracefully", 
async () => { + const { fetchUrlContent } = await import("../fetchUrlContent") + vi.mocked(fetchUrlContent).mockRejectedValueOnce(new Error("Network timeout")) + + const result = await parseMentions("Check @https://example.com", "/test") + + expect(result.contentBlocks).toHaveLength(1) + expect(result.contentBlocks[0].type).toBe("url") + expect(result.contentBlocks[0].content).toContain("Error fetching URL content: Network timeout") + }) + + it("should indicate truncation when content is truncated", async () => { + const { fetchUrlContent } = await import("../fetchUrlContent") + vi.mocked(fetchUrlContent).mockResolvedValueOnce({ + url: "https://example.com", + content: "Truncated content...", + truncated: true, + }) + const result = await parseMentions("Check @https://example.com", "/test") - // URL mentions are now replaced with a quoted reference (no fetching) - expect(result.text).toContain("'https://example.com'") + expect(result.contentBlocks).toHaveLength(1) + expect(result.contentBlocks[0].content).toContain("[Content truncated due to length]") }) }) diff --git a/src/core/mentions/fetchUrlContent.ts b/src/core/mentions/fetchUrlContent.ts new file mode 100644 index 0000000000..6761bcddc7 --- /dev/null +++ b/src/core/mentions/fetchUrlContent.ts @@ -0,0 +1,77 @@ +import axios from "axios" +import * as cheerio from "cheerio" + +/** Upper bound, in characters, on the text returned in content; longer text is cut to this length. */ +const MAX_CONTENT_LENGTH = 50_000 +/** Timeout handed to the HTTP request, in milliseconds. */ +const REQUEST_TIMEOUT_MS = 15_000 + +/** Outcome of a fetchUrlContent call. */ +export interface FetchUrlResult { + /** The URL that was requested, exactly as it was passed in. */ + url: string + /** Text extracted from an HTML response, or the raw body for any other content type, after any truncation. */ + content: string + /** True when the text was longer than MAX_CONTENT_LENGTH and has been cut. */ + truncated: boolean +} + +/** + * Fetches a URL and extracts readable text content from the HTML. + * Uses cheerio for HTML parsing and text extraction. + * Falls back to raw text for non-HTML responses. 
+ *
+ * @param url - URL to request; it is handed to axios.get as-is.
+ * @returns The same url, the page text (HTML is reduced to readable text, any
+ *   other content type is returned raw) limited to MAX_CONTENT_LENGTH
+ *   characters, and a flag telling whether that limit cut the text.
+ * @throws Any error axios.get rejects with is propagated unchanged.
+ */
+export async function fetchUrlContent(url: string): Promise<FetchUrlResult> {
+	const response = await axios.get(url, {
+		timeout: REQUEST_TIMEOUT_MS,
+		maxRedirects: 5,
+		responseType: "text",
+		headers: {
+			"User-Agent": "Roo-Code/1.0 (URL Context Fetcher)",
+			Accept: "text/html, application/xhtml+xml, text/plain, */*",
+		},
+		// Limit response size to avoid downloading huge files
+		maxContentLength: 5 * 1024 * 1024, // 5MB
+	})
+
+	// Media types are case-insensitive and the header value is not guaranteed to
+	// be a string (it can be missing or multi-valued), so coerce before matching.
+	const contentType = String(response.headers?.["content-type"] ?? "").toLowerCase()
+
+	// The body is requested as text; an absent body must become "" rather than
+	// the literal words "undefined" / "null" that String() would produce.
+	const data: unknown = response.data
+	let rawBody: string
+	if (typeof data === "string") {
+		rawBody = data
+	} else if (data === null || data === undefined) {
+		rawBody = ""
+	} else {
+		rawBody = String(data)
+	}
+
+	// For non-HTML content (plain text, JSON, etc.), use raw body
+	const isHtml = contentType.includes("text/html") || contentType.includes("application/xhtml")
+	let text = isHtml ? extractTextFromHtml(rawBody) : rawBody
+
+	const truncated = text.length > MAX_CONTENT_LENGTH
+	if (truncated) {
+		// Never cut between the two halves of a surrogate pair: when the last
+		// kept code unit is a high surrogate, drop it as well.
+		const lastKept = text.charCodeAt(MAX_CONTENT_LENGTH - 1)
+		const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff
+		text = text.slice(0, endsOnHighSurrogate ? MAX_CONTENT_LENGTH - 1 : MAX_CONTENT_LENGTH)
+	}
+
+	return { url, content: text, truncated }
+}
+
+/**
+ * Extracts meaningful text content from an HTML string using cheerio.
+ * Removes scripts, styles, navigation, and other non-content elements.
+ *
+ * @param html - HTML markup to parse.
+ * @returns Text of the outermost main-content elements, or of the whole body
+ *   when no such element exists or they hold no text. Runs of horizontal
+ *   whitespace are collapsed to one space and at most one blank line is kept
+ *   between paragraphs.
+ */
+function extractTextFromHtml(html: string): string {
+	const $ = cheerio.load(html)
+
+	// Remove non-content elements
+	$(
+		"script, style, nav, footer, header, noscript, svg, iframe, form, button, [role='navigation'], [role='banner'], [role='contentinfo'], [aria-hidden='true']",
+	).remove()
+
+	// Tidy text pulled out of markup.
+	const tidy = (raw: string): string =>
+		raw
+			.replace(/[^\S\n]+/g, " ") // Collapse horizontal whitespace (spaces, tabs, CR, nbsp)
+			.replace(/ ?\n ?/g, "\n") // Strip spaces beside line breaks so whitespace-only lines count as blank
+			.replace(/\n{3,}/g, "\n\n") // Collapse excessive newlines
+			.trim()
+
+	// Try to find main content area first. text() concatenates the text of every
+	// matched element, so a match nested inside another match (an article inside
+	// main, say) would be emitted twice; keep only the outermost matches.
+	const contentSelector = "main, article, [role='main'], .content, #content, .post, .article"
+	const contentEl = $(contentSelector).filter((_, el) => $(el).parents(contentSelector).length === 0)
+
+	// Fall back to the whole body when no content area exists or it holds no text
+	const mainText = tidy(contentEl.text())
+	return mainText.length > 0 ? mainText : tidy($("body").text())
+}
diff --git a/src/core/mentions/index.ts b/src/core/mentions/index.ts index 1bfb90d23f..f08f1476b2 100644 --- a/src/core/mentions/index.ts +++ b/src/core/mentions/index.ts @@ -19,6 +19,7 @@ import { RooIgnoreController } from "../ignore/RooIgnoreController" import { getCommand, type Command } from "../../services/command/commands" import { buildSkillResult, resolveSkillContentForMode, type SkillLookup } from "../../services/skills/skillInvocation" import type { SkillContent } from "../../shared/skills" +import { fetchUrlContent } from "./fetchUrlContent" export async function openMention(cwd: string, mention?: string): Promise { if (!mention) { @@ -163,7 +164,7 @@ export async function parseMentions( parsedText = parsedText.replace(mentionRegexGlobal, (match, mention) => { mentions.add(mention) if (mention.startsWith("http")) { - return `'${mention}'` + return `'${mention}' (see below for fetched content)` } else if (mention.startsWith("/")) { // Clean path reference - no "see below" since we format like tool results const mentionPath = mention.slice(1) @@ -221,6 +222,21 @@ export async function parseMentions( } catch (error) { parsedText += `\n\n\nError fetching commit info: ${error.message}\n` } + } else if (mention.startsWith("http")) { + try { +
const result = await fetchUrlContent(mention) + const truncationNote = result.truncated ? "\n[Content truncated due to length]" : "" + contentBlocks.push({ + type: "url", + content: `[url_content for '${mention}']\n${result.content}${truncationNote}`, + }) + } catch (error) { + const errorMsg = error instanceof Error ? error.message : String(error) + contentBlocks.push({ + type: "url", + content: `[url_content for '${mention}']\nError fetching URL content: ${errorMsg}`, + }) + } } else if (mention === "terminal") { try { const terminalOutput = await getLatestTerminalOutput()