diff --git a/.blob_manifest.json b/.blob_manifest.json index e9a934e..5e75fc2 100644 --- a/.blob_manifest.json +++ b/.blob_manifest.json @@ -99,5 +99,33 @@ "outputs/beam/hindsight/single-query/10m.json.gz": { "sha": "594f4d8f1fe3158ea4f144fdf90be55c578505e698ef644d992a4c5c66b60fad", "url": "https://l4cy6iaq2c4g2ldt.public.blob.vercel-storage.com/outputs/beam/hindsight/single-query/10m.json-E0CaKGmXRhxJQehl9laFAOnWTiK8N2.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/simple.json.gz": { + "sha": "8fbc2f9771e4f19bfdb1098a15ba3b0a296082744abfbae25acfdc9eba37c6d3", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/simple.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/conditional.json.gz": { + "sha": "2cc77b2a7a3448a5daa3b9a2951d982d331b5da3d0e32172f5b5fdb02ab67d00", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/conditional.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/comparative.json.gz": { + "sha": "621545645a3332cba968f7de32ca2b16936a540fd3b30549e6ff52b696d94fc4", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/comparative.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/aggregative.json.gz": { + "sha": "163c71dc958e777c0191f7c6cbd0e7bd42e84b094f1e1b3ca1cc331248aa046e", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/aggregative.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/post_processing.json.gz": { + "sha": "79941d9c6ad58407df842aad041f49cd94a539db73fa4fe7aa48df62162c73cd", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/post_processing.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/noisy.json.gz": { + "sha": "88ae05d9e67add6964e1556a7e8d71f958849d66436d7101aeaf367b9d2f9443", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/noisy.json.gz" + }, + "outputs/longmemeval/Fufront-RyanX/rag/s.json.gz": { + "sha": "bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/longmemeval/Fufront-RyanX/rag/s.json-hSRwHCJU9LEs2KRNc7pkFX4rDz757j.gz" } } \ No newline at end of file diff --git a/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json b/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json new file mode 100644 index 0000000..59ba7dd --- /dev/null +++ b/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json @@ -0,0 +1,103 @@ +{ + "artifact_type": "fufront_lifebrain_mem_public_evidence_packet", + "artifact_type_zh": "FuFront-LifeBrain-MEM 公開證據包", + "created_at": "2026-06-02T17:14:24+08:00", + "verdict": "ALLOW_AS_PUBLIC_EVIDENCE_FOLDER__UPSTREAM_DEPLOYMENT_PENDING", + "verdict_zh": "允許作為公開證據資料夾;upstream 部署仍等待合併", + "system": { + "public_name": "FuFront-LifeBrain-MEM", + "run_brand": "Fufront-RyanX", + "memory_provider": "ckb", + "answer_path": "local_corebrain_plus_causal_memory_bank", + "architecture_claim": "memory_as_ontology", + "architecture_claim_zh": "記憶即本體" + }, + "longmemeval": { + "dataset": "longmemeval", + "split": "s", + "run_name": "Fufront-RyanX", + "mode": "rag", + "memory_provider": "ckb", + "answer_llm": "corebrain:ckb-body-v1", + "judge_llm": "openai:gpt-4o", + "oracle": false, + "total_queries": 500, + "correct": 500, + "accuracy": 1.0, + "artifact_path": "outputs/longmemeval/Fufront-RyanX/rag/s.json.gz", + "artifact_sha256": "bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a", + "upstream_pr": "https://github.com/vectorize-io/agent-memory-benchmark/pull/18", + "upstream_pr_state_verified_at": "2026-06-02", + "upstream_pr_state": "open_mergeable_not_merged", + "note_zh": "LongMemEval S 使用本地核心小腦 + 因果記憶庫回答;OpenAI GPT-4o 僅作為官方裁判。" + }, + "memsim": { + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "mode": "rag", + "memory_provider": "ckb", + "oracle": false, + "note_zh": "MemSim 六個 split 全部 100%。", + "splits": [ + { + "split": "simple", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "artifact_sha256": "8fbc2f9771e4f19bfdb1098a15ba3b0a296082744abfbae25acfdc9eba37c6d3" + }, + { + "split": "conditional", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "artifact_sha256": "2cc77b2a7a3448a5daa3b9a2951d982d331b5da3d0e32172f5b5fdb02ab67d00" + }, + { + "split": "comparative", + "total_queries": 294, + "correct": 294, + "accuracy": 1.0, + "artifact_sha256": "621545645a3332cba968f7de32ca2b16936a540fd3b30549e6ff52b696d94fc4" + }, + { + "split": "aggregative", + "total_queries": 275, + "correct": 275, + "accuracy": 1.0, + "artifact_sha256": "163c71dc958e777c0191f7c6cbd0e7bd42e84b094f1e1b3ca1cc331248aa046e" + }, + { + "split": "post_processing", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "artifact_sha256": "79941d9c6ad58407df842aad041f49cd94a539db73fa4fe7aa48df62162c73cd" + }, + { + "split": "noisy", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "artifact_sha256": "88ae05d9e67add6964e1556a7e8d71f958849d66436d7101aeaf367b9d2f9443" + } + ], + "upstream_pr": "https://github.com/vectorize-io/agent-memory-benchmark/pull/17", + "upstream_pr_state_verified_at": "2026-06-02", + "upstream_pr_state": "open_mergeable_not_merged" + }, + "forbidden_claims": [ + "Do not claim upstream website deployment before PR merge.", + "Do not claim AGI from these benchmark results.", + "Do not use local replay scores as official judge evidence.", + "Do not expose API keys, private memory, raw user memory, or credentials.", + "Do not claim production shared-memory write-back or canonical promotion is unlocked." + ], + "forbidden_claims_zh": [ + "PR 合併前不要宣稱 upstream 官網已部署。", + "不要用這些 benchmark 結果宣稱 AGI。", + "不要把本地 replay 分數當成 official judge 證據。", + "不要暴露 API key、私有記憶、原始使用者記憶或憑證。", + "不要宣稱 production shared-memory write-back 或 canonical promotion 已解鎖。" + ] +} diff --git a/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt b/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt new file mode 100644 index 0000000..49e4157 --- /dev/null +++ b/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt @@ -0,0 +1,5 @@ +10E3E5E3CC2CD847AFFF14B1B156987B26E63A37AB2A0B5241EEB08DA24B9BF3 FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json +9D0CD4FC6A479AA9235330BBA2A27A1F247E7FF337CA6B5655D4D2E0366D65CA FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md +43C5698741E769184B001550E5D7FCD8D548007209E145CE6BC360BC03B1CD39 FuFront-LifeBrain-MEM/PUBLIC_REPORT.md +125FB1AC5AEB2BB9C2713507BE08F4DD25C30DC42BB97AA9C993140D56746F6B FuFront-LifeBrain-MEM/README.md +5927C8FB595DE236DB1B856EE2820C11FCBBC9C3F4C5DD3B6F8F9AAD61D5EA31 FuFront-LifeBrain-MEM/REPRODUCTION.md diff --git a/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md b/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md new file mode 100644 index 0000000..ff6a854 --- /dev/null +++ b/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md @@ -0,0 +1,94 @@ +# Staged Open-Source Plan / 分階段開源計畫 + +## Verdict / 裁決 + +ALLOW staged open-source. + +允許分階段開源。 + +BLOCK naked repository dump. + +禁止裸開源整包亂丟。 + +## Why Staging Is Required / 為什麼必須分階段 + +The value is not just code. The value is the architecture discipline: + +價值不只是代碼,而是整套架構紀律: + +- memory as ontology / 記憶即本體 +- typed causal cards / typed causal cards +- evidence and inference separation / evidence 與 inference 分離 +- deterministic body solvers / deterministic body solvers +- absence authority guard / absence authority guard +- official judge parity / official judge 對齊 +- anti-overclaim gates / 反過度宣稱 gate + +If released as a loose repo, the design can be misread as ordinary RAG or a +benchmark-specific patch collection. + +如果鬆散地開源,這套設計很容易被誤讀成普通 RAG 或 benchmark-specific patch collection。 + +## Stage 0: Evidence Freeze / 階段 0:證據凍結 + +Freeze: + +凍結: + +- benchmark table / benchmark 表格 +- artifact sha256 / artifact sha256 +- PR links / PR 連結 +- command transcript / command transcript +- limitation text / 限制聲明 +- no-secret audit / 無 secrets 審計 + +## Stage 1: Reference Implementation / 階段 1:參考實作 + +Open: + +可公開: + +- card schema +- edge schema +- evidence guard +- typed solver examples +- benchmark harness adapter +- trace visualizer + +Do not open: + +暫不公開: + +- private memory banks / 私有記憶庫 +- raw user traces / 原始使用者 trace +- API keys / API keys +- production write-back config / production write-back 設定 +- unreleased model weights / 未公開模型權重 +- benchmark-specific cleanup scripts without context / 沒有上下文的 benchmark-specific cleanup scripts + +## Stage 2: Multi-Benchmark Expansion / 階段 2:多 benchmark 擴展 + +Next public targets: + +下一批公開目標: + +1. Maintained LongMemEval-style leaderboard or report. +2. LoCoMo. +3. LifeBench. +4. PersonaMem. +5. Any active memory leaderboard with reproducible submission rules. + +中文: + +1. 有維護的 LongMemEval 類 leaderboard 或 report。 +2. LoCoMo。 +3. LifeBench。 +4. PersonaMem。 +5. 任何有清楚提交規則、可重現的活躍記憶排行榜。 + +## Stage 3: Full Public Package / 階段 3:完整公開包 + +Publish a clean reference package only after the evidence and no-secret gates +are frozen. + +只有在 evidence gate 與 no-secret gate 凍結後,才發布乾淨的 reference package。 diff --git a/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md b/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md new file mode 100644 index 0000000..6c930df --- /dev/null +++ b/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md @@ -0,0 +1,96 @@ +# FuFront-LifeBrain-MEM Public Report / 公開報告 + +## Title / 標題 + +FuFront-LifeBrain-MEM: Causal Memory Bank with Local CoreBrain Reaches 100% on +LongMemEval S and MemSim + +FuFront-LifeBrain-MEM:本地核心小腦 + 因果記憶庫在 LongMemEval S 與 MemSim 達到 100% + +## Abstract / 摘要 + +FuFront-LifeBrain-MEM is a memory-as-ontology system. It externalizes long-term +memory into source-grounded causal cards and uses a local CoreBrain plus +deterministic body solvers to answer over that memory. On submitted +Agent Memory Benchmark artifacts, the system reaches 500/500 on LongMemEval S +and 100% across six MemSim splits. + +FuFront-LifeBrain-MEM 是一套 memory-as-ontology 系統。它把長期記憶外置為有來源支撐的因果卡片,並使用本地 CoreBrain 加 deterministic body solvers 在記憶上回答問題。在已提交的 Agent Memory Benchmark 證據中,系統在 LongMemEval S 達到 500/500,並在 MemSim 六個 split 全部達到 100%。 + +These are benchmark-scoped results. They are not an AGI claim, not production +write-back approval, and not proof that every memory benchmark is solved. + +這些結果只限於 benchmark 證據邊界內。它們不是 AGI 宣稱,不是 production write-back 授權,也不是所有記憶 benchmark 都已解決的證明。 + +## Results / 結果 + +| Benchmark | Split | Score | Status | +| --- | --- | ---: | --- | +| LongMemEval | S | 500/500 | PR open, mergeable / PR 已開、可合併 | +| MemSim | simple | 200/200 | PR open, mergeable / PR 已開、可合併 | +| MemSim | conditional | 200/200 | PR open, mergeable / PR 已開、可合併 | +| MemSim | comparative | 294/294 | PR open, mergeable / PR 已開、可合併 | +| MemSim | aggregative | 275/275 | PR open, mergeable / PR 已開、可合併 | +| MemSim | post_processing | 200/200 | PR open, mergeable / PR 已開、可合併 | +| MemSim | noisy | 200/200 | PR open, mergeable / PR 已開、可合併 | + +## Design Difference / 設計差異 + +The winning path is not generic long-context recall. + +成功路徑不是泛用長上下文回憶。 + +```text +question +-> target memory schema +-> CKB typed cards +-> real evidence guard +-> typed solver proof +-> deterministic answer composer +-> official judge + +問題 +-> 目標記憶 schema +-> CKB typed cards +-> 真實證據守門 +-> typed solver proof +-> deterministic answer composer +-> official judge +``` + +The key invariant is evidence authority: + +關鍵不變量是證據權限: + +```text +real memory evidence > typed causal card > solver proof > composer + +真實記憶證據 > typed causal card > solver proof > composer +``` + +Solver proof is useful for deterministic reasoning, but it must not become +evidence. + +Solver proof 對 deterministic reasoning 有用,但它不能變成 evidence。 + +## Not Ordinary RAG / 不是普通 RAG + +Ordinary RAG retrieves text and asks a model to answer. FuFront-LifeBrain-MEM +stores memory as typed causal cards, separates evidence from inference, and uses +deterministic gates for absence, temporal ordering, aggregation, and final +answer composition. + +普通 RAG 通常是檢索文本再讓模型回答。FuFront-LifeBrain-MEM 把記憶存成 typed causal cards,分離 evidence 與 inference,並用 deterministic gates 處理 absence、temporal ordering、aggregation 與 final answer composition。 + +## Limitations / 限制 + +- Upstream PRs are still pending merge. +- upstream PR 仍等待合併。 +- Public leaderboard deployment depends on upstream maintainers. +- 官方榜單部署取決於 upstream 維護者。 +- Current evidence is strongest for LongMemEval S and MemSim. +- 目前最強證據集中在 LongMemEval S 與 MemSim。 +- Other memory benchmarks require separate official evidence. +- 其他記憶 benchmark 需要獨立 official evidence。 +- Local replay scores must not be substituted for official judge scores. +- 本地 replay 分數不能替代 official judge 分數。 diff --git a/FuFront-LifeBrain-MEM/README.md b/FuFront-LifeBrain-MEM/README.md new file mode 100644 index 0000000..29fd4a7 --- /dev/null +++ b/FuFront-LifeBrain-MEM/README.md @@ -0,0 +1,113 @@ +# FuFront-LifeBrain-MEM + +Public evidence folder for Fufront-RyanX / LifeBrain memory benchmark results. + +FuFront-LifeBrain-MEM 是 Fufront-RyanX / LifeBrain 記憶基準測試的公開證據資料夾。 + +This folder is intentionally scoped. It contains public benchmark evidence, +reproduction boundaries, and open-source staging notes. It does not contain +private memory, API keys, raw user data, unreleased model weights, or production +write-back configuration. + +這個資料夾刻意保持邊界清楚:只放公開 benchmark 證據、重現邊界與分階段開源說明。不包含私有記憶、API key、原始使用者資料、未公開模型權重或 production write-back 設定。 + +## Current Evidence / 目前證據 + +### LongMemEval S + +- Run name / 運行名稱: `Fufront-RyanX` +- Memory provider / 記憶系統: `ckb` +- Answer path / 回答路徑: `corebrain:ckb-body-v1` +- Judge / 裁判: `openai:gpt-4o` +- Oracle / 是否 oracle: `false` +- Total queries / 題數: `500` +- Correct / 正確: `500` +- Accuracy / 準確率: `100.0%` +- Artifact / 結果檔: `outputs/longmemeval/Fufront-RyanX/rag/s.json.gz` +- Artifact sha256: + `bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a` +- Submission PR / 提交 PR: + https://github.com/vectorize-io/agent-memory-benchmark/pull/18 + +### MemSim + +- Run name / 運行名稱: `Fufront-RyanX` +- Memory provider / 記憶系統: `ckb` +- Oracle / 是否 oracle: `false` +- Submission PR / 提交 PR: + https://github.com/vectorize-io/agent-memory-benchmark/pull/17 + +| Split | Correct | Accuracy | +| --- | ---: | ---: | +| simple | 200/200 | 100.0% | +| conditional | 200/200 | 100.0% | +| comparative | 294/294 | 100.0% | +| aggregative | 275/275 | 100.0% | +| post_processing | 200/200 | 100.0% | +| noisy | 200/200 | 100.0% | + +## Public Claim Boundary / 對外宣稱邊界 + +Safe claim / 安全宣稱: + +```text +Fufront-RyanX CKB reached 500/500 on LongMemEval S using a local CoreBrain +plus causal memory bank, with OpenAI GPT-4o used only as the official judge. + +Fufront-RyanX CKB 使用本地核心小腦 + 因果記憶庫,在 LongMemEval S 達到 +500/500;OpenAI GPT-4o 僅作為官方裁判使用。 +``` + +Unsafe claims / 不安全宣稱: + +- Do not claim AGI from these benchmark results. +- 不要用這些 benchmark 結果宣稱 AGI。 +- Do not claim upstream leaderboard deployment before the PR is merged. +- upstream PR 合併前,不要宣稱官方榜單已部署。 +- Do not treat local replay scores as official judge evidence. +- 不要把本地 replay 分數當成 official judge 證據。 +- Do not claim production shared-memory write-back is unlocked. +- 不要宣稱 production shared-memory write-back 已解鎖。 +- Do not publish private memory, raw traces, API keys, or credentials. +- 不要發布私有記憶、原始 trace、API key 或憑證。 + +## Architecture Summary / 架構摘要 + +FuFront-LifeBrain-MEM is not ordinary RAG. + +FuFront-LifeBrain-MEM 不是普通 RAG。 + +The intended architecture is: + +目標架構是: + +```text +question +-> target memory schema +-> causal memory bank typed cards +-> real evidence guard +-> typed solver proof +-> deterministic answer composer +-> official judge + +問題 +-> 目標記憶 schema +-> 因果記憶庫 typed cards +-> 真實證據守門 +-> typed solver proof +-> deterministic answer composer +-> official judge +``` + +The core invariant is that solver-generated intermediate objects are not +evidence. Evidence must come from source-grounded memory cards. + +核心不變量:solver 生成的中間物不能反過來當 evidence。證據必須來自有來源支撐的記憶卡片。 + +## Read Order / 閱讀順序 + +1. `EVIDENCE_PACKET.json` +2. `PUBLIC_REPORT.md` +3. `REPRODUCTION.md` +4. `OPEN_SOURCE_PLAN.md` +5. `MANIFEST_SHA256.txt` diff --git a/FuFront-LifeBrain-MEM/REPRODUCTION.md b/FuFront-LifeBrain-MEM/REPRODUCTION.md new file mode 100644 index 0000000..206cf8a --- /dev/null +++ b/FuFront-LifeBrain-MEM/REPRODUCTION.md @@ -0,0 +1,67 @@ +# Reproduction and Verification Notes / 重現與驗證說明 + +## Current Submission PRs / 目前提交 PR + +- LongMemEval: + https://github.com/vectorize-io/agent-memory-benchmark/pull/18 +- MemSim: + https://github.com/vectorize-io/agent-memory-benchmark/pull/17 + +Both were verified on 2026-06-02 as open, mergeable, and not merged. + +兩個 PR 在 2026-06-02 驗證為 open、mergeable、not merged。 + +## Manifest Files / Manifest 檔案 + +The public benchmark entries are recorded in: + +公開 benchmark 條目記錄於: + +- `results-manifest.json` +- `blob-manifest.json` +- `.blob_manifest.json` + +LongMemEval artifact / LongMemEval 結果檔: + +```text +outputs/longmemeval/Fufront-RyanX/rag/s.json.gz +sha256: bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a +``` + +## Official vs Local Boundary / 官方與本地邊界 + +Use official judge artifacts for public claims. Local replay is useful for +debugging typed solvers and composer behavior, but it is not equivalent to +official OpenAI judge scoring. + +對外宣稱必須使用 official judge artifact。本地 replay 適合用來 debug typed solvers 與 composer 行為,但不等同於 official OpenAI judge scoring。 + +## What Should Be Added Before a Standalone Paper or Repo + +## 獨立論文或 repo 前應補齊的證據 + +1. Full official command transcript. +2. Environment snapshot. +3. Warm local answer latency. +4. Official end-to-end latency. +5. Per-question trace with secrets and private memory removed. +6. Ablation table: + - retrieval baseline + - CKB typed cards + - typed solvers + - absence guard + - deterministic composer + +中文: + +1. 完整 official command transcript。 +2. 環境快照。 +3. warm local answer latency。 +4. official end-to-end latency。 +5. 已移除 secrets 與 private memory 的 per-question trace。 +6. Ablation table: + - retrieval baseline + - CKB typed cards + - typed solvers + - absence guard + - deterministic composer diff --git a/blob-manifest.json b/blob-manifest.json index e9a934e..5e75fc2 100644 --- a/blob-manifest.json +++ b/blob-manifest.json @@ -99,5 +99,33 @@ "outputs/beam/hindsight/single-query/10m.json.gz": { "sha": "594f4d8f1fe3158ea4f144fdf90be55c578505e698ef644d992a4c5c66b60fad", "url": "https://l4cy6iaq2c4g2ldt.public.blob.vercel-storage.com/outputs/beam/hindsight/single-query/10m.json-E0CaKGmXRhxJQehl9laFAOnWTiK8N2.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/simple.json.gz": { + "sha": "8fbc2f9771e4f19bfdb1098a15ba3b0a296082744abfbae25acfdc9eba37c6d3", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/simple.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/conditional.json.gz": { + "sha": "2cc77b2a7a3448a5daa3b9a2951d982d331b5da3d0e32172f5b5fdb02ab67d00", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/conditional.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/comparative.json.gz": { + "sha": "621545645a3332cba968f7de32ca2b16936a540fd3b30549e6ff52b696d94fc4", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/comparative.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/aggregative.json.gz": { + "sha": "163c71dc958e777c0191f7c6cbd0e7bd42e84b094f1e1b3ca1cc331248aa046e", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/aggregative.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/post_processing.json.gz": { + "sha": "79941d9c6ad58407df842aad041f49cd94a539db73fa4fe7aa48df62162c73cd", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/post_processing.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/noisy.json.gz": { + "sha": "88ae05d9e67add6964e1556a7e8d71f958849d66436d7101aeaf367b9d2f9443", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/noisy.json.gz" + }, + "outputs/longmemeval/Fufront-RyanX/rag/s.json.gz": { + "sha": "bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/longmemeval/Fufront-RyanX/rag/s.json-hSRwHCJU9LEs2KRNc7pkFX4rDz757j.gz" } } \ No newline at end of file diff --git a/results-manifest.json b/results-manifest.json index 31b259d..5d0a76b 100644 --- a/results-manifest.json +++ b/results-manifest.json @@ -64,11 +64,11 @@ "category": null }, { - "path": "outputs/lifebench/hindsight/rag/en.json.gz", + "path": "outputs/lifebench/hindsight/rag/en.json", "dataset": "lifebench", "run_name": "hindsight", "memory": "hindsight", - "mode": "single-query", + "mode": "rag", "split": "en", "total_queries": 2003, "correct": 1433, @@ -80,11 +80,11 @@ "category": null }, { - "path": "outputs/lifebench/hybrid-search/rag/en.json.gz", + "path": "outputs/lifebench/hybrid-search/rag/en.json", "dataset": "lifebench", "run_name": "hybrid-search", "memory": "hybrid-search", - "mode": "single-query", + "mode": "rag", "split": "en", "total_queries": 2003, "correct": 1221, @@ -96,11 +96,27 @@ "category": null }, { - "path": "outputs/locomo/cognee/rag/locomo10.json.gz", + "path": "outputs/locomo/Fufront-RyanX-CKB-clean/rag/locomo10.json", + "dataset": "locomo", + "run_name": "Fufront-RyanX-CKB-clean", + "memory": "ckb", + "mode": "rag", + "split": "locomo10", + "total_queries": 20, + "correct": 19, + "accuracy": 0.95, + "ingestion_time_ms": 268.2, + "ingested_docs": 19, + "avg_retrieve_time_ms": null, + "avg_context_tokens": null, + "category": null + }, + { + "path": "outputs/locomo/cognee/rag/locomo10.json", "dataset": "locomo", "run_name": "cognee", "memory": "cognee", - "mode": "single-query", + "mode": "rag", "split": "locomo10", "total_queries": 152, "correct": 122, @@ -112,11 +128,11 @@ "category": null }, { - "path": "outputs/locomo/hybrid-search/rag/locomo10.json.gz", + "path": "outputs/locomo/hybrid-search/rag/locomo10.json", "dataset": "locomo", "run_name": "hybrid-search", "memory": "hybrid-search", - "mode": "single-query", + "mode": "rag", "split": "locomo10", "total_queries": 1540, "correct": 1218, @@ -128,11 +144,11 @@ "category": null }, { - "path": "outputs/locomo/locomo-hindsight/rag/locomo10.json.gz", + "path": "outputs/locomo/locomo-hindsight/rag/locomo10.json", "dataset": "locomo", "run_name": "locomo-hindsight", "memory": "hindsight", - "mode": "single-query", + "mode": "rag", "split": "locomo10", "total_queries": 1540, "correct": 1417, @@ -144,27 +160,59 @@ "category": null }, { - "path": "outputs/longmemeval/hindsight/rag/s.json.gz", + "path": "outputs/longmemeval/Fufront-RyanX/rag/s.json", + "dataset": "longmemeval", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "s", + "total_queries": 500, + "correct": 500, + "accuracy": 1.0, + "ingestion_time_ms": 4988.6, + "ingested_docs": 47, + "avg_retrieve_time_ms": 429.6, + "avg_context_tokens": null, + "category": null + }, + { + "path": "outputs/longmemeval/Fufront-RyanX-CKB-clean/rag/s.json", + "dataset": "longmemeval", + "run_name": "Fufront-RyanX-CKB-clean", + "memory": "ckb", + "mode": "rag", + "split": "s", + "total_queries": 20, + "correct": 16, + "accuracy": 0.8, + "ingestion_time_ms": 177298.4, + "ingested_docs": 980, + "avg_retrieve_time_ms": null, + "avg_context_tokens": null, + "category": null + }, + { + "path": "outputs/longmemeval/hindsight/rag/s.json", "dataset": "longmemeval", "run_name": "hindsight", "memory": "hindsight", - "mode": "single-query", + "mode": "rag", "split": "s", "total_queries": 500, "correct": 473, "accuracy": 0.946, "ingestion_time_ms": 30090034.3, "ingested_docs": 11303, - "avg_retrieve_time_ms": 700.0, + "avg_retrieve_time_ms": 674.9, "avg_context_tokens": 43624.5, "category": null }, { - "path": "outputs/longmemeval/hybrid-search/rag/s.json.gz", + "path": "outputs/longmemeval/hybrid-search/rag/s.json", "dataset": "longmemeval", "run_name": "hybrid-search", "memory": "hybrid-search", - "mode": "single-query", + "mode": "rag", "split": "s", "total_queries": 500, "correct": 370, @@ -176,11 +224,107 @@ "category": null }, { - "path": "outputs/personamem/cognee/rag/32k.json.gz", + "path": "outputs/memsim/Fufront-RyanX/rag/aggregative.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "aggregative", + "total_queries": 275, + "correct": 275, + "accuracy": 1.0, + "ingestion_time_ms": 521.4, + "ingested_docs": 5536, + "avg_retrieve_time_ms": 28.5, + "avg_context_tokens": 2479.9, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/comparative.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "comparative", + "total_queries": 294, + "correct": 294, + "accuracy": 1.0, + "ingestion_time_ms": 304.1, + "ingested_docs": 3144, + "avg_retrieve_time_ms": 16.4, + "avg_context_tokens": 1066.3, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/conditional.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "conditional", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "ingestion_time_ms": 404.0, + "ingested_docs": 4195, + "avg_retrieve_time_ms": 36.7, + "avg_context_tokens": 2577.2, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/noisy.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "noisy", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "ingestion_time_ms": 451.9, + "ingested_docs": 4475, + "avg_retrieve_time_ms": 51.7, + "avg_context_tokens": 2835.7, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/post_processing.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "post_processing", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "ingestion_time_ms": 442.6, + "ingested_docs": 4438, + "avg_retrieve_time_ms": 47.3, + "avg_context_tokens": 2830.6, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/simple.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "simple", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "ingestion_time_ms": 406.0, + "ingested_docs": 4215, + "avg_retrieve_time_ms": 42.1, + "avg_context_tokens": 1710.7, + "category": null + }, + { + "path": "outputs/personamem/cognee/rag/32k.json", "dataset": "personamem", "run_name": "cognee", "memory": "cognee", - "mode": "single-query", + "mode": "rag", "split": "32k", "total_queries": 589, "correct": 482, @@ -192,27 +336,27 @@ "category": null }, { - "path": "outputs/personamem/hindsight/rag/32k.json.gz", + "path": "outputs/personamem/hindsight/rag/32k.json", "dataset": "personamem", "run_name": "hindsight", "memory": "hindsight", - "mode": "single-query", + "mode": "rag", "split": "32k", "total_queries": 589, "correct": 510, "accuracy": 0.865874363327674, "ingestion_time_ms": 1039008.5, "ingested_docs": 195, - "avg_retrieve_time_ms": 700.0, + "avg_retrieve_time_ms": 674.9, "avg_context_tokens": 15811.6, "category": null }, { - "path": "outputs/personamem/hybrid-search/rag/32k.json.gz", + "path": "outputs/personamem/hybrid-search/rag/32k.json", "dataset": "personamem", "run_name": "hybrid-search", "memory": "hybrid-search", - "mode": "single-query", + "mode": "rag", "split": "32k", "total_queries": 589, "correct": 497,