From 50019b486699901b508296ce3cfad5cf111c6438 Mon Sep 17 00:00:00 2001 From: Gabbar Singh Date: Wed, 13 May 2026 12:38:55 +0300 Subject: [PATCH] feat(jsluice): add URL verification via httpx to filter noise and dead endpoints jsluice extracts many URLs from JavaScript sources, including bundled libraries, static assets, and sourcemaps that are not useful for recon. This adds an httpx-based verification step that probes candidate URLs and only publishes live endpoints to the knowledge graph. Includes configurable noise-filter patterns, fail-closed behavior on probe errors, and full test coverage. Co-Authored-By: Claude Opus 4.6 (1M context) --- recon/helpers/resource_enum/__init__.py | 6 + .../helpers/resource_enum/jsluice_helpers.py | 172 +++++++++++++++++- recon/main_recon_modules/resource_enum.py | 59 +++++- recon/project_settings.py | 27 +++ recon/tests/test_hakrawler_jsluice.py | 90 +++++++++ 5 files changed, 352 insertions(+), 2 deletions(-) diff --git a/recon/helpers/resource_enum/__init__.py b/recon/helpers/resource_enum/__init__.py index b6a1f496..caa7115a 100644 --- a/recon/helpers/resource_enum/__init__.py +++ b/recon/helpers/resource_enum/__init__.py @@ -62,8 +62,11 @@ # jsluice helpers from .jsluice_helpers import ( + DEFAULT_JSLUICE_EXCLUDE_PATTERNS, + filter_jsluice_url, run_jsluice_analysis, merge_jsluice_into_by_base_url, + verify_jsluice_urls, ) # FFuf helpers @@ -123,8 +126,11 @@ "pull_hakrawler_docker_image", "merge_hakrawler_into_by_base_url", # jsluice + "DEFAULT_JSLUICE_EXCLUDE_PATTERNS", + "filter_jsluice_url", "run_jsluice_analysis", "merge_jsluice_into_by_base_url", + "verify_jsluice_urls", # FFuf "run_ffuf_discovery", "pull_ffuf_binary_check", diff --git a/recon/helpers/resource_enum/jsluice_helpers.py b/recon/helpers/resource_enum/jsluice_helpers.py index 345a1990..4bbcb95b 100644 --- a/recon/helpers/resource_enum/jsluice_helpers.py +++ b/recon/helpers/resource_enum/jsluice_helpers.py @@ -11,12 +11,182 @@ import ssl import subprocess 
import urllib.request
import uuid
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urlparse, urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed


# Default noise filter applied to jsluice-extracted URLs before probing.
# Entries shaped like a file extension (leading dot, no slash, alphanumeric
# last character) are matched as whole extensions; every other entry is a
# case-insensitive substring match. See filter_jsluice_url().
DEFAULT_JSLUICE_EXCLUDE_PATTERNS = [
    '/_next/image', '/_next/static', '/_next/data', '/__nextjs',
    '/_nuxt/', '/__nuxt',
    '/runtime.', '/polyfills.', '/vendor.',
    '/webpack', '/chunk.', '.chunk.js', '.bundle.js', 'hot-update',
    '/static/', '/public/', '/dist/', '/build/', '/lib/', '/vendor/', '/node_modules/',
    '.js', '.mjs', '.map', '.css', '.scss', '.sass', '.less',
    '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.webp', '.avif',
    '.woff', '.woff2', '.ttf', '.eot', '.otf',
    '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm',
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.zip', '.rar', '.7z', '.tar', '.gz',
    '/rxjs/', '/react/', '/angular/', '/lodash/', '/zone.js/',
]


def _create_temp_dir(prefix: str = "jsluice_verify") -> Path:
    """Create a temp directory under /tmp/redamon for Docker-in-Docker compatibility.

    Args:
        prefix: Name prefix; an 8-character random hex suffix is appended.

    Returns:
        Path of the directory (created, including parents, if missing).
    """
    temp_dir = Path(f"/tmp/redamon/.{prefix}_{uuid.uuid4().hex[:8]}")
    temp_dir.mkdir(parents=True, exist_ok=True)
    return temp_dir


def _cleanup_temp_dir(temp_dir: Path) -> None:
    """Remove a temp directory tree, ignoring filesystem errors (best effort)."""
    shutil.rmtree(temp_dir, ignore_errors=True)


def _is_extension_pattern(pattern: str) -> bool:
    """Return True when *pattern* looks like a file extension (e.g. '.js', '.chunk.js')."""
    return (
        len(pattern) > 1
        and pattern.startswith('.')
        and '/' not in pattern
        and pattern[-1].isalnum()
    )


def _contains_extension(text: str, extension: str) -> bool:
    """Return True when *extension* occurs in *text* and is not followed by a letter or digit.

    The boundary check keeps '.js' from matching '.json' or '.jsp' while still
    matching 'app.js', 'app.js?v=1' and '/zone.js/dist'.
    """
    start = text.find(extension)
    while start != -1:
        end = start + len(extension)
        if end == len(text) or not text[end].isalnum():
            return True
        start = text.find(extension, start + 1)
    return False


def filter_jsluice_url(url: str, exclude_patterns: List[str]) -> bool:
    """
    Return True when a jsluice URL should be probed.

    jsluice often extracts library, bundle, sourcemap, and static asset paths
    from JavaScript source. These are filtered before HTTP validation to avoid
    spending probe budget on obvious non-application endpoints.

    Matching is case-insensitive. Extension-like patterns are compared only
    against the path, params, query and fragment (never the scheme or host)
    and must end on an extension boundary, so '.js' rejects '/app.js' but
    keeps '/api/users.json'. All other patterns are plain substring matches
    against the whole URL.

    Args:
        url: Candidate URL; empty or unparsable values are rejected.
        exclude_patterns: Noise patterns; empty entries are ignored.

    Returns:
        False when the URL is empty, cannot be parsed, or matches a pattern.
    """
    if not url:
        return False

    try:
        url_lower = url.lower()
        parsed = urlparse(url)
        path_lower = (parsed.path or "").lower()
        query_lower = (parsed.query or "").lower()
        haystack = f"{url_lower} {path_lower} {query_lower}"
        # Everything after the authority; extensions cannot live in the host,
        # and hosts on TLDs such as .zip/.mov must not be treated as files.
        resource = " ".join(
            (part or "").lower()
            for part in (parsed.path, parsed.params, parsed.query, parsed.fragment)
        )

        for pattern in exclude_patterns:
            if not pattern:
                continue
            needle = pattern.lower()
            if _is_extension_pattern(needle):
                if _contains_extension(resource, needle):
                    return False
            elif needle in haystack:
                return False
        return True
    except (AttributeError, TypeError, ValueError):
        # Non-string input or a URL urlparse rejects: do not probe it.
        return False


def _parse_httpx_results(output_file: Path, accept_codes: Set[int]) -> Set[str]:
    """Collect URLs with an accepted status from an httpx JSON-lines output file.

    Lines that are not JSON objects, or that carry no integer status, are skipped.
    A missing file yields an empty set.
    """
    live: Set[str] = set()
    if not output_file.exists():
        return live

    with open(output_file, 'r', encoding='utf-8', errors='replace') as handle:
        for line in handle:
            try:
                entry = json.loads(line.strip())
            except json.JSONDecodeError:
                continue
            if not isinstance(entry, dict):
                continue

            status = entry.get('status_code') or entry.get('status-code')
            try:
                status = int(status)
            except (TypeError, ValueError):
                continue

            url = entry.get('url', '')
            if url and status in accept_codes:
                live.add(url)
    return live


def verify_jsluice_urls(
    urls: List[str],
    docker_image: str,
    threads: int,
    timeout: int,
    rate_limit: int,
    accept_status: List[int],
    exclude_patterns: Optional[List[str]] = None,
    use_proxy: bool = False,
) -> Tuple[Set[str], Dict[str, int]]:
    """
    Verify jsluice-discovered URLs are live using httpx.

    This verifier fails closed: if probing fails or times out, unverified
    jsluice URLs are not returned for graph publication.

    Args:
        urls: URLs extracted by jsluice (duplicates are collapsed before probing).
        docker_image: Image run via ``docker run`` to execute httpx.
        threads: Value passed to httpx ``-t``.
        timeout: Value passed to httpx ``-timeout``.
        rate_limit: Value passed to httpx ``-rl``.
        accept_status: HTTP status codes that count as "live".
        exclude_patterns: Noise patterns for filter_jsluice_url(); None means no filtering.
        use_proxy: When True, ``-proxy socks5://127.0.0.1:9050`` is passed to httpx.

    Returns:
        ``(verified_urls, stats)`` where stats holds the ``jsluice_verify_total``,
        ``jsluice_verify_candidates``, ``jsluice_skipped_blacklist``,
        ``jsluice_verified`` and ``jsluice_skipped_unverified`` counters.
    """
    exclude_patterns = exclude_patterns or []
    stats = {
        "jsluice_verify_total": len(urls),
        "jsluice_verify_candidates": 0,
        "jsluice_skipped_blacklist": 0,
        "jsluice_verified": 0,
        "jsluice_skipped_unverified": 0,
    }

    if not urls:
        return set(), stats

    candidates = []
    for url in sorted(set(urls)):
        if filter_jsluice_url(url, exclude_patterns):
            candidates.append(url)
        else:
            stats["jsluice_skipped_blacklist"] += 1

    stats["jsluice_verify_candidates"] = len(candidates)
    if not candidates:
        print(f"[*][jsluice] Verification skipped: all {len(urls)} URLs matched noise filters")
        return set(), stats

    print(f"\n[*][jsluice] Verifying {len(candidates)} jsluice URLs...")
    if stats["jsluice_skipped_blacklist"]:
        print(f"[*][jsluice] Skipped {stats['jsluice_skipped_blacklist']} URLs via noise filters")

    accept_codes = {int(code) for code in accept_status}

    temp_dir = _create_temp_dir("jsluice_verify")
    try:
        urls_file = temp_dir / "urls.txt"
        output_file = temp_dir / "verified.json"

        with open(urls_file, 'w', encoding='utf-8') as f:
            f.writelines(f"{url}\n" for url in candidates)

        cmd = [
            "docker", "run", "--rm",
            "-v", f"{temp_dir}:/data",
            docker_image,
            "-l", "/data/urls.txt",
            "-o", "/data/verified.json",
            "-json",
            "-silent",
            "-nc",
            "-t", str(threads),
            "-timeout", str(timeout),
            "-rl", str(rate_limit),
        ]

        if use_proxy:
            # NOTE(review): the container is started without host networking,
            # so 127.0.0.1 here is presumably the container's own loopback;
            # confirm the SOCKS proxy is actually reachable from inside it.
            cmd.extend(["-proxy", "socks5://127.0.0.1:9050"])

        try:
            # Fixed 300 s ceiling for the whole run; when it is hit every
            # candidate is dropped (fail closed).
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
        except subprocess.TimeoutExpired:
            print("[!][jsluice] URL verification timeout; dropping unverified jsluice URLs")
            stats["jsluice_skipped_unverified"] = len(candidates)
            return set(), stats
        except Exception as e:
            # Deliberately broad: any failure to run the probe must fail closed.
            print(f"[!][jsluice] URL verification error: {e}; dropping unverified jsluice URLs")
            stats["jsluice_skipped_unverified"] = len(candidates)
            return set(), stats

        if result.returncode != 0:
            # Surface probe failures instead of reporting them as "0 live URLs".
            detail = (result.stderr or "").strip()[-500:]
            print(f"[!][jsluice] httpx exited with code {result.returncode}: {detail}")

        verified = _parse_httpx_results(output_file, accept_codes)

        stats["jsluice_verified"] = len(verified)
        stats["jsluice_skipped_unverified"] = len(candidates) - len(verified)
        print(f"[+][jsluice] Verified: {len(verified)}/{len(candidates)} URLs are live")
        return verified, stats
    finally:
        _cleanup_temp_dir(temp_dir)
run_jsluice_analysis, merge_jsluice_into_by_base_url, + verify_jsluice_urls, # FFuf helpers run_ffuf_discovery, pull_ffuf_binary_check, @@ -165,6 +167,9 @@ def run_resource_enum(recon_data: dict, output_file: Optional[Path] = None, sett ("JSLUICE_ENABLED", "jsluice"), ("JSLUICE_MAX_FILES", "jsluice"), ("JSLUICE_PARALLELISM", "jsluice"), + ("JSLUICE_VERIFY_URLS", "jsluice"), + ("JSLUICE_VERIFY_RATE_LIMIT", "jsluice"), + ("JSLUICE_VERIFY_THREADS", "jsluice"), ("ARJUN_ENABLED", "Arjun"), ("ARJUN_THREADS", "Arjun"), ("ARJUN_RATE_LIMIT", "Arjun"), @@ -216,6 +221,19 @@ def run_resource_enum(recon_data: dict, output_file: Optional[Path] = None, sett JSLUICE_EXTRACT_SECRETS = settings.get('JSLUICE_EXTRACT_SECRETS', True) JSLUICE_CONCURRENCY = settings.get('JSLUICE_CONCURRENCY', 5) JSLUICE_PARALLELISM = settings.get('JSLUICE_PARALLELISM', 3) + JSLUICE_VERIFY_URLS = settings.get('JSLUICE_VERIFY_URLS', True) + JSLUICE_VERIFY_DOCKER_IMAGE = settings.get('JSLUICE_VERIFY_DOCKER_IMAGE', 'projectdiscovery/httpx:latest') + JSLUICE_VERIFY_TIMEOUT = settings.get('JSLUICE_VERIFY_TIMEOUT', 5) + JSLUICE_VERIFY_RATE_LIMIT = settings.get('JSLUICE_VERIFY_RATE_LIMIT', 50) + JSLUICE_VERIFY_THREADS = settings.get('JSLUICE_VERIFY_THREADS', 50) + JSLUICE_VERIFY_ACCEPT_STATUS = settings.get( + 'JSLUICE_VERIFY_ACCEPT_STATUS', + [200, 201, 301, 302, 307, 308, 401, 403] + ) + JSLUICE_EXCLUDE_PATTERNS = list(settings.get( + 'JSLUICE_EXCLUDE_PATTERNS', + DEFAULT_JSLUICE_EXCLUDE_PATTERNS, + )) # FFuf settings FFUF_ENABLED = settings.get('FFUF_ENABLED', False) @@ -415,6 +433,12 @@ def run_resource_enum(recon_data: dict, output_file: Optional[Path] = None, sett print(f"[*][jsluice] Extract URLs: {JSLUICE_EXTRACT_URLS}") print(f"[*][jsluice] Extract secrets: {JSLUICE_EXTRACT_SECRETS}") print(f"[*][jsluice] Parallelism: {JSLUICE_PARALLELISM} concurrent base URLs") + print(f"[*][jsluice] URL verification: {JSLUICE_VERIFY_URLS}") + if JSLUICE_VERIFY_URLS: + print(f"[*][jsluice] Verify rate limit: 
{JSLUICE_VERIFY_RATE_LIMIT} req/s") + print(f"[*][jsluice] Verify threads: {JSLUICE_VERIFY_THREADS}") + print(f"[*][jsluice] Verify timeout: {JSLUICE_VERIFY_TIMEOUT}s") + print(f"[*][jsluice] Noise filter patterns: {len(JSLUICE_EXCLUDE_PATTERNS)}") # FFuf settings print(f"[*][FFuf] Enabled: {FFUF_ENABLED}") if FFUF_ENABLED: @@ -693,11 +717,17 @@ def _run_kr_wordlist(wordlist_name): "jsluice_parsed": 0, "jsluice_new": 0, "jsluice_overlap": 0, + "jsluice_verify_total": 0, + "jsluice_verify_candidates": 0, + "jsluice_skipped_blacklist": 0, + "jsluice_verified": 0, + "jsluice_skipped_unverified": 0, } if JSLUICE_ENABLED and (JSLUICE_EXTRACT_URLS or JSLUICE_EXTRACT_SECRETS): all_crawl_urls = list(set(katana_urls + hakrawler_urls)) if all_crawl_urls: + verify_stats = {} jsluice_result = run_jsluice_analysis( all_crawl_urls, JSLUICE_MAX_FILES, @@ -710,15 +740,41 @@ def _run_kr_wordlist(wordlist_name): use_proxy ) + if jsluice_result.get("urls"): + if JSLUICE_VERIFY_URLS: + verified_jsluice_urls, verify_stats = verify_jsluice_urls( + jsluice_result["urls"], + JSLUICE_VERIFY_DOCKER_IMAGE, + JSLUICE_VERIFY_THREADS, + JSLUICE_VERIFY_TIMEOUT, + JSLUICE_VERIFY_RATE_LIMIT, + JSLUICE_VERIFY_ACCEPT_STATUS, + JSLUICE_EXCLUDE_PATTERNS, + use_proxy, + ) + jsluice_result["urls"] = sorted(verified_jsluice_urls) + jsluice_stats.update(verify_stats) + else: + jsluice_stats["jsluice_verify_total"] = len(jsluice_result["urls"]) + jsluice_stats["jsluice_verify_candidates"] = len(jsluice_result["urls"]) + jsluice_stats["jsluice_verified"] = len(jsluice_result["urls"]) + if jsluice_result.get("urls"): print("\n[*][jsluice] Merging extracted URLs into results...") - organized_data['by_base_url'], jsluice_stats = merge_jsluice_into_by_base_url( + organized_data['by_base_url'], merge_stats = merge_jsluice_into_by_base_url( jsluice_result["urls"], organized_data['by_base_url'], ) + jsluice_stats.update(merge_stats) + jsluice_stats.update(verify_stats) print(f"[+][jsluice] Total URLs: 
{jsluice_stats['jsluice_total']}") print(f"[+][jsluice] New endpoints: {jsluice_stats['jsluice_new']}") print(f"[+][jsluice] Overlap: {jsluice_stats['jsluice_overlap']}") + if JSLUICE_VERIFY_URLS: + print(f"[+][jsluice] Skipped (blacklist): {jsluice_stats['jsluice_skipped_blacklist']}") + print(f"[+][jsluice] Skipped (unverified): {jsluice_stats['jsluice_skipped_unverified']}") + elif JSLUICE_VERIFY_URLS and jsluice_stats.get("jsluice_verify_total", 0) > 0: + print(f"[-][jsluice] No URLs survived validation ({jsluice_stats['jsluice_skipped_blacklist']} blacklisted, {jsluice_stats['jsluice_skipped_unverified']} unverified)") # FFuf directory fuzzing (runs after crawlers and jsluice, before GAU merge) ffuf_stats = { @@ -1069,6 +1125,7 @@ def _run_kr_wordlist(wordlist_name): # jsluice metadata 'jsluice_enabled': JSLUICE_ENABLED, 'jsluice_max_files': JSLUICE_MAX_FILES if JSLUICE_ENABLED else None, + 'jsluice_verify_enabled': JSLUICE_VERIFY_URLS if JSLUICE_ENABLED else False, 'jsluice_urls_found': len(jsluice_in_scope_urls), 'jsluice_secrets_found': len(jsluice_result.get("secrets", [])), 'jsluice_stats': jsluice_stats, diff --git a/recon/project_settings.py b/recon/project_settings.py index 526f2d25..ef125edf 100644 --- a/recon/project_settings.py +++ b/recon/project_settings.py @@ -337,6 +337,26 @@ 'JSLUICE_EXTRACT_SECRETS': True, 'JSLUICE_CONCURRENCY': 5, 'JSLUICE_PARALLELISM': 5, + 'JSLUICE_VERIFY_URLS': True, + 'JSLUICE_VERIFY_DOCKER_IMAGE': 'projectdiscovery/httpx:latest', + 'JSLUICE_VERIFY_TIMEOUT': 5, + 'JSLUICE_VERIFY_RATE_LIMIT': 50, + 'JSLUICE_VERIFY_THREADS': 50, + 'JSLUICE_VERIFY_ACCEPT_STATUS': [200, 201, 301, 302, 307, 308, 401, 403], + 'JSLUICE_EXCLUDE_PATTERNS': [ + '/_next/image', '/_next/static', '/_next/data', '/__nextjs', + '/_nuxt/', '/__nuxt', + '/runtime.', '/polyfills.', '/vendor.', + '/webpack', '/chunk.', '.chunk.js', '.bundle.js', 'hot-update', + '/static/', '/public/', '/dist/', '/build/', '/lib/', '/vendor/', '/node_modules/', + '.js', 
'.mjs', '.map', '.css', '.scss', '.sass', '.less', + '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.webp', '.avif', + '.woff', '.woff2', '.ttf', '.eot', '.otf', + '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', + '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', + '.zip', '.rar', '.7z', '.tar', '.gz', + '/rxjs/', '/react/', '/angular/', '/lodash/', '/zone.js/', + ], # ========== JS RECON SCANNER ========== 'JS_RECON_ENABLED': False, @@ -882,6 +902,13 @@ def fetch_project_settings(project_id: str, webapp_url: str) -> dict[str, Any]: settings['JSLUICE_EXTRACT_SECRETS'] = project.get('jsluiceExtractSecrets', DEFAULT_SETTINGS['JSLUICE_EXTRACT_SECRETS']) settings['JSLUICE_CONCURRENCY'] = project.get('jsluiceConcurrency', DEFAULT_SETTINGS['JSLUICE_CONCURRENCY']) settings['JSLUICE_PARALLELISM'] = project.get('jsluiceParallelism', DEFAULT_SETTINGS['JSLUICE_PARALLELISM']) + settings['JSLUICE_VERIFY_URLS'] = project.get('jsluiceVerifyUrls', DEFAULT_SETTINGS['JSLUICE_VERIFY_URLS']) + settings['JSLUICE_VERIFY_DOCKER_IMAGE'] = project.get('jsluiceVerifyDockerImage', DEFAULT_SETTINGS['JSLUICE_VERIFY_DOCKER_IMAGE']) + settings['JSLUICE_VERIFY_TIMEOUT'] = project.get('jsluiceVerifyTimeout', DEFAULT_SETTINGS['JSLUICE_VERIFY_TIMEOUT']) + settings['JSLUICE_VERIFY_RATE_LIMIT'] = project.get('jsluiceVerifyRateLimit', DEFAULT_SETTINGS['JSLUICE_VERIFY_RATE_LIMIT']) + settings['JSLUICE_VERIFY_THREADS'] = project.get('jsluiceVerifyThreads', DEFAULT_SETTINGS['JSLUICE_VERIFY_THREADS']) + settings['JSLUICE_VERIFY_ACCEPT_STATUS'] = project.get('jsluiceVerifyAcceptStatus', DEFAULT_SETTINGS['JSLUICE_VERIFY_ACCEPT_STATUS']) + settings['JSLUICE_EXCLUDE_PATTERNS'] = project.get('jsluiceExcludePatterns', DEFAULT_SETTINGS['JSLUICE_EXCLUDE_PATTERNS']) # JS Recon Scanner settings['JS_RECON_ENABLED'] = project.get('jsReconEnabled', DEFAULT_SETTINGS['JS_RECON_ENABLED']) diff --git a/recon/tests/test_hakrawler_jsluice.py b/recon/tests/test_hakrawler_jsluice.py index 
import sys
import json
import subprocess
import tempfile
from pathlib import Path
from unittest import mock


def test_jsluice_filter_url_rejects_common_static_library_noise():
    """Library/static paths are rejected by filter_jsluice_url; app routes pass."""
    from recon.helpers.resource_enum.jsluice_helpers import filter_jsluice_url

    noise_patterns = ["/rxjs/", "/node_modules/", "/webpack", ".map", ".chunk.js"]
    expectations = [
        ("https://example.com/rxjs/static-5.10", False),
        ("https://example.com/node_modules/lodash/index.js", False),
        ("https://example.com/_next/static/chunks/app.chunk.js", False),
        ("https://example.com/api/users", True),
        ("https://example.com/dashboard/settings", True),
    ]

    for candidate, should_probe in expectations:
        assert filter_jsluice_url(candidate, noise_patterns) is should_probe
    print("PASS: test_jsluice_filter_url_rejects_common_static_library_noise")


def test_verify_jsluice_urls_filters_noise_and_unverified():
    """Only non-blacklisted URLs with an accepted status come back as verified."""
    from recon.helpers.resource_enum.jsluice_helpers import verify_jsluice_urls

    helpers_module = "recon.helpers.resource_enum.jsluice_helpers"
    probe_records = [
        {"url": "https://example.com/api/live", "status_code": 200},
        {"url": "https://example.com/api/missing", "status_code": 404},
    ]

    with tempfile.TemporaryDirectory() as scratch:
        scratch_path = Path(scratch)
        results_path = scratch_path / "verified.json"

        def fake_run(cmd, **kwargs):
            # Stand in for httpx: one JSON object per line, newline-terminated.
            results_path.write_text(
                "".join(f"{json.dumps(record)}\n" for record in probe_records)
            )
            return mock.MagicMock(returncode=0, stdout="", stderr="")

        with mock.patch(f"{helpers_module}._create_temp_dir", return_value=scratch_path):
            with mock.patch(f"{helpers_module}._cleanup_temp_dir"):
                with mock.patch("subprocess.run", side_effect=fake_run):
                    verified, stats = verify_jsluice_urls(
                        urls=[
                            "https://example.com/api/live",
                            "https://example.com/api/missing",
                            "https://example.com/rxjs/static-5.10",
                        ],
                        docker_image="projectdiscovery/httpx:latest",
                        threads=10,
                        timeout=5,
                        rate_limit=50,
                        accept_status=[200, 201, 301, 302, 307, 308, 401, 403],
                        exclude_patterns=["/rxjs/"],
                    )

    assert verified == {"https://example.com/api/live"}
    expected_counts = {
        "jsluice_verify_total": 3,
        "jsluice_skipped_blacklist": 1,
        "jsluice_verified": 1,
        "jsluice_skipped_unverified": 1,
    }
    for counter, value in expected_counts.items():
        assert stats[counter] == value
    print("PASS: test_verify_jsluice_urls_filters_noise_and_unverified")


def test_verify_jsluice_urls_fails_closed_on_httpx_error():
    """A failing probe run must yield no verified URLs at all."""
    from recon.helpers.resource_enum.jsluice_helpers import verify_jsluice_urls

    helpers_module = "recon.helpers.resource_enum.jsluice_helpers"

    with tempfile.TemporaryDirectory() as scratch:
        scratch_path = Path(scratch)
        with mock.patch(f"{helpers_module}._create_temp_dir", return_value=scratch_path):
            with mock.patch(f"{helpers_module}._cleanup_temp_dir"):
                with mock.patch("subprocess.run", side_effect=RuntimeError("docker failed")):
                    verified, stats = verify_jsluice_urls(
                        urls=["https://example.com/api/live"],
                        docker_image="projectdiscovery/httpx:latest",
                        threads=10,
                        timeout=5,
                        rate_limit=50,
                        accept_status=[200, 201, 301, 302, 307, 308, 401, 403],
                        exclude_patterns=[],
                    )

    assert verified == set()
    expected_counts = {
        "jsluice_verify_total": 1,
        "jsluice_verified": 0,
        "jsluice_skipped_unverified": 1,
    }
    for counter, value in expected_counts.items():
        assert stats[counter] == value
    print("PASS: test_verify_jsluice_urls_fails_closed_on_httpx_error")


# ===========================================================================
# jsluice merge tests
# ===========================================================================