From d6f7a9e509916e960c76ec2f9ad1ae0f233b3c8b Mon Sep 17 00:00:00 2001 From: Fufront-RyanX Date: Tue, 2 Jun 2026 17:00:34 +0800 Subject: [PATCH 1/3] results: publish Fufront-RyanX LongMemEval --- .blob_manifest.json | 28 +++++++ blob-manifest.json | 28 +++++++ results-manifest.json | 188 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 222 insertions(+), 22 deletions(-) diff --git a/.blob_manifest.json b/.blob_manifest.json index e9a934e..5e75fc2 100644 --- a/.blob_manifest.json +++ b/.blob_manifest.json @@ -99,5 +99,33 @@ "outputs/beam/hindsight/single-query/10m.json.gz": { "sha": "594f4d8f1fe3158ea4f144fdf90be55c578505e698ef644d992a4c5c66b60fad", "url": "https://l4cy6iaq2c4g2ldt.public.blob.vercel-storage.com/outputs/beam/hindsight/single-query/10m.json-E0CaKGmXRhxJQehl9laFAOnWTiK8N2.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/simple.json.gz": { + "sha": "8fbc2f9771e4f19bfdb1098a15ba3b0a296082744abfbae25acfdc9eba37c6d3", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/simple.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/conditional.json.gz": { + "sha": "2cc77b2a7a3448a5daa3b9a2951d982d331b5da3d0e32172f5b5fdb02ab67d00", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/conditional.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/comparative.json.gz": { + "sha": "621545645a3332cba968f7de32ca2b16936a540fd3b30549e6ff52b696d94fc4", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/comparative.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/aggregative.json.gz": { + "sha": "163c71dc958e777c0191f7c6cbd0e7bd42e84b094f1e1b3ca1cc331248aa046e", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/aggregative.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/post_processing.json.gz": { + "sha": "79941d9c6ad58407df842aad041f49cd94a539db73fa4fe7aa48df62162c73cd", + 
"url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/post_processing.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/noisy.json.gz": { + "sha": "88ae05d9e67add6964e1556a7e8d71f958849d66436d7101aeaf367b9d2f9443", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/noisy.json.gz" + }, + "outputs/longmemeval/Fufront-RyanX/rag/s.json.gz": { + "sha": "bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/longmemeval/Fufront-RyanX/rag/s.json-hSRwHCJU9LEs2KRNc7pkFX4rDz757j.gz" } } \ No newline at end of file diff --git a/blob-manifest.json b/blob-manifest.json index e9a934e..5e75fc2 100644 --- a/blob-manifest.json +++ b/blob-manifest.json @@ -99,5 +99,33 @@ "outputs/beam/hindsight/single-query/10m.json.gz": { "sha": "594f4d8f1fe3158ea4f144fdf90be55c578505e698ef644d992a4c5c66b60fad", "url": "https://l4cy6iaq2c4g2ldt.public.blob.vercel-storage.com/outputs/beam/hindsight/single-query/10m.json-E0CaKGmXRhxJQehl9laFAOnWTiK8N2.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/simple.json.gz": { + "sha": "8fbc2f9771e4f19bfdb1098a15ba3b0a296082744abfbae25acfdc9eba37c6d3", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/simple.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/conditional.json.gz": { + "sha": "2cc77b2a7a3448a5daa3b9a2951d982d331b5da3d0e32172f5b5fdb02ab67d00", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/conditional.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/comparative.json.gz": { + "sha": "621545645a3332cba968f7de32ca2b16936a540fd3b30549e6ff52b696d94fc4", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/comparative.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/aggregative.json.gz": { + "sha": 
"163c71dc958e777c0191f7c6cbd0e7bd42e84b094f1e1b3ca1cc331248aa046e", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/aggregative.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/post_processing.json.gz": { + "sha": "79941d9c6ad58407df842aad041f49cd94a539db73fa4fe7aa48df62162c73cd", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/post_processing.json.gz" + }, + "outputs/memsim/Fufront-RyanX/rag/noisy.json.gz": { + "sha": "88ae05d9e67add6964e1556a7e8d71f958849d66436d7101aeaf367b9d2f9443", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/memsim/Fufront-RyanX/rag/noisy.json.gz" + }, + "outputs/longmemeval/Fufront-RyanX/rag/s.json.gz": { + "sha": "bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a", + "url": "https://jekdpawoyjgjis0w.public.blob.vercel-storage.com/outputs/longmemeval/Fufront-RyanX/rag/s.json-hSRwHCJU9LEs2KRNc7pkFX4rDz757j.gz" } } \ No newline at end of file diff --git a/results-manifest.json b/results-manifest.json index 31b259d..5d0a76b 100644 --- a/results-manifest.json +++ b/results-manifest.json @@ -64,11 +64,11 @@ "category": null }, { - "path": "outputs/lifebench/hindsight/rag/en.json.gz", + "path": "outputs/lifebench/hindsight/rag/en.json", "dataset": "lifebench", "run_name": "hindsight", "memory": "hindsight", - "mode": "single-query", + "mode": "rag", "split": "en", "total_queries": 2003, "correct": 1433, @@ -80,11 +80,11 @@ "category": null }, { - "path": "outputs/lifebench/hybrid-search/rag/en.json.gz", + "path": "outputs/lifebench/hybrid-search/rag/en.json", "dataset": "lifebench", "run_name": "hybrid-search", "memory": "hybrid-search", - "mode": "single-query", + "mode": "rag", "split": "en", "total_queries": 2003, "correct": 1221, @@ -96,11 +96,27 @@ "category": null }, { - "path": "outputs/locomo/cognee/rag/locomo10.json.gz", + "path": "outputs/locomo/Fufront-RyanX-CKB-clean/rag/locomo10.json", + 
"dataset": "locomo", + "run_name": "Fufront-RyanX-CKB-clean", + "memory": "ckb", + "mode": "rag", + "split": "locomo10", + "total_queries": 20, + "correct": 19, + "accuracy": 0.95, + "ingestion_time_ms": 268.2, + "ingested_docs": 19, + "avg_retrieve_time_ms": null, + "avg_context_tokens": null, + "category": null + }, + { + "path": "outputs/locomo/cognee/rag/locomo10.json", "dataset": "locomo", "run_name": "cognee", "memory": "cognee", - "mode": "single-query", + "mode": "rag", "split": "locomo10", "total_queries": 152, "correct": 122, @@ -112,11 +128,11 @@ "category": null }, { - "path": "outputs/locomo/hybrid-search/rag/locomo10.json.gz", + "path": "outputs/locomo/hybrid-search/rag/locomo10.json", "dataset": "locomo", "run_name": "hybrid-search", "memory": "hybrid-search", - "mode": "single-query", + "mode": "rag", "split": "locomo10", "total_queries": 1540, "correct": 1218, @@ -128,11 +144,11 @@ "category": null }, { - "path": "outputs/locomo/locomo-hindsight/rag/locomo10.json.gz", + "path": "outputs/locomo/locomo-hindsight/rag/locomo10.json", "dataset": "locomo", "run_name": "locomo-hindsight", "memory": "hindsight", - "mode": "single-query", + "mode": "rag", "split": "locomo10", "total_queries": 1540, "correct": 1417, @@ -144,27 +160,59 @@ "category": null }, { - "path": "outputs/longmemeval/hindsight/rag/s.json.gz", + "path": "outputs/longmemeval/Fufront-RyanX/rag/s.json", + "dataset": "longmemeval", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "s", + "total_queries": 500, + "correct": 500, + "accuracy": 1.0, + "ingestion_time_ms": 4988.6, + "ingested_docs": 47, + "avg_retrieve_time_ms": 429.6, + "avg_context_tokens": null, + "category": null + }, + { + "path": "outputs/longmemeval/Fufront-RyanX-CKB-clean/rag/s.json", + "dataset": "longmemeval", + "run_name": "Fufront-RyanX-CKB-clean", + "memory": "ckb", + "mode": "rag", + "split": "s", + "total_queries": 20, + "correct": 16, + "accuracy": 0.8, + "ingestion_time_ms": 177298.4, 
+ "ingested_docs": 980, + "avg_retrieve_time_ms": null, + "avg_context_tokens": null, + "category": null + }, + { + "path": "outputs/longmemeval/hindsight/rag/s.json", "dataset": "longmemeval", "run_name": "hindsight", "memory": "hindsight", - "mode": "single-query", + "mode": "rag", "split": "s", "total_queries": 500, "correct": 473, "accuracy": 0.946, "ingestion_time_ms": 30090034.3, "ingested_docs": 11303, - "avg_retrieve_time_ms": 700.0, + "avg_retrieve_time_ms": 674.9, "avg_context_tokens": 43624.5, "category": null }, { - "path": "outputs/longmemeval/hybrid-search/rag/s.json.gz", + "path": "outputs/longmemeval/hybrid-search/rag/s.json", "dataset": "longmemeval", "run_name": "hybrid-search", "memory": "hybrid-search", - "mode": "single-query", + "mode": "rag", "split": "s", "total_queries": 500, "correct": 370, @@ -176,11 +224,107 @@ "category": null }, { - "path": "outputs/personamem/cognee/rag/32k.json.gz", + "path": "outputs/memsim/Fufront-RyanX/rag/aggregative.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "aggregative", + "total_queries": 275, + "correct": 275, + "accuracy": 1.0, + "ingestion_time_ms": 521.4, + "ingested_docs": 5536, + "avg_retrieve_time_ms": 28.5, + "avg_context_tokens": 2479.9, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/comparative.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "comparative", + "total_queries": 294, + "correct": 294, + "accuracy": 1.0, + "ingestion_time_ms": 304.1, + "ingested_docs": 3144, + "avg_retrieve_time_ms": 16.4, + "avg_context_tokens": 1066.3, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/conditional.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "conditional", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "ingestion_time_ms": 404.0, + "ingested_docs": 4195, + 
"avg_retrieve_time_ms": 36.7, + "avg_context_tokens": 2577.2, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/noisy.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "noisy", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "ingestion_time_ms": 451.9, + "ingested_docs": 4475, + "avg_retrieve_time_ms": 51.7, + "avg_context_tokens": 2835.7, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/post_processing.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "post_processing", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "ingestion_time_ms": 442.6, + "ingested_docs": 4438, + "avg_retrieve_time_ms": 47.3, + "avg_context_tokens": 2830.6, + "category": null + }, + { + "path": "outputs/memsim/Fufront-RyanX/rag/simple.json", + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "memory": "ckb", + "mode": "rag", + "split": "simple", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "ingestion_time_ms": 406.0, + "ingested_docs": 4215, + "avg_retrieve_time_ms": 42.1, + "avg_context_tokens": 1710.7, + "category": null + }, + { + "path": "outputs/personamem/cognee/rag/32k.json", "dataset": "personamem", "run_name": "cognee", "memory": "cognee", - "mode": "single-query", + "mode": "rag", "split": "32k", "total_queries": 589, "correct": 482, @@ -192,27 +336,27 @@ "category": null }, { - "path": "outputs/personamem/hindsight/rag/32k.json.gz", + "path": "outputs/personamem/hindsight/rag/32k.json", "dataset": "personamem", "run_name": "hindsight", "memory": "hindsight", - "mode": "single-query", + "mode": "rag", "split": "32k", "total_queries": 589, "correct": 510, "accuracy": 0.865874363327674, "ingestion_time_ms": 1039008.5, "ingested_docs": 195, - "avg_retrieve_time_ms": 700.0, + "avg_retrieve_time_ms": 674.9, "avg_context_tokens": 15811.6, "category": null }, { - "path": 
"outputs/personamem/hybrid-search/rag/32k.json.gz", + "path": "outputs/personamem/hybrid-search/rag/32k.json", "dataset": "personamem", "run_name": "hybrid-search", "memory": "hybrid-search", - "mode": "single-query", + "mode": "rag", "split": "32k", "total_queries": 589, "correct": 497, From 5393ec36a782a1cd996c68ac670b09265da90948 Mon Sep 17 00:00:00 2001 From: Fufront-RyanX Date: Tue, 2 Jun 2026 17:21:18 +0800 Subject: [PATCH 2/3] docs: add FuFront-LifeBrain-MEM evidence folder --- FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json | 92 ++++++++++++++++++++++ FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt | 5 ++ FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md | 69 ++++++++++++++++ FuFront-LifeBrain-MEM/PUBLIC_REPORT.md | 68 ++++++++++++++++ FuFront-LifeBrain-MEM/README.md | 86 ++++++++++++++++++++ FuFront-LifeBrain-MEM/REPRODUCTION.md | 46 +++++++++++ 6 files changed, 366 insertions(+) create mode 100644 FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json create mode 100644 FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt create mode 100644 FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md create mode 100644 FuFront-LifeBrain-MEM/PUBLIC_REPORT.md create mode 100644 FuFront-LifeBrain-MEM/README.md create mode 100644 FuFront-LifeBrain-MEM/REPRODUCTION.md diff --git a/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json b/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json new file mode 100644 index 0000000..75a11bf --- /dev/null +++ b/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json @@ -0,0 +1,92 @@ +{ + "artifact_type": "fufront_lifebrain_mem_public_evidence_packet", + "created_at": "2026-06-02T17:14:24+08:00", + "verdict": "ALLOW_AS_PUBLIC_EVIDENCE_FOLDER__UPSTREAM_DEPLOYMENT_PENDING", + "system": { + "public_name": "FuFront-LifeBrain-MEM", + "run_brand": "Fufront-RyanX", + "memory_provider": "ckb", + "answer_path": "local_corebrain_plus_causal_memory_bank", + "architecture_claim": "memory_as_ontology" + }, + "longmemeval": { + "dataset": "longmemeval", + "split": "s", + "run_name": "Fufront-RyanX", + "mode": "rag", + 
"memory_provider": "ckb", + "answer_llm": "corebrain:ckb-body-v1", + "judge_llm": "openai:gpt-4o", + "oracle": false, + "total_queries": 500, + "correct": 500, + "accuracy": 1.0, + "artifact_path": "outputs/longmemeval/Fufront-RyanX/rag/s.json.gz", + "artifact_sha256": "bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a", + "upstream_pr": "https://github.com/vectorize-io/agent-memory-benchmark/pull/18", + "upstream_pr_state_verified_at": "2026-06-02", + "upstream_pr_state": "open_mergeable_not_merged" + }, + "memsim": { + "dataset": "memsim", + "run_name": "Fufront-RyanX", + "mode": "rag", + "memory_provider": "ckb", + "oracle": false, + "splits": [ + { + "split": "simple", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "artifact_sha256": "8fbc2f9771e4f19bfdb1098a15ba3b0a296082744abfbae25acfdc9eba37c6d3" + }, + { + "split": "conditional", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "artifact_sha256": "2cc77b2a7a3448a5daa3b9a2951d982d331b5da3d0e32172f5b5fdb02ab67d00" + }, + { + "split": "comparative", + "total_queries": 294, + "correct": 294, + "accuracy": 1.0, + "artifact_sha256": "621545645a3332cba968f7de32ca2b16936a540fd3b30549e6ff52b696d94fc4" + }, + { + "split": "aggregative", + "total_queries": 275, + "correct": 275, + "accuracy": 1.0, + "artifact_sha256": "163c71dc958e777c0191f7c6cbd0e7bd42e84b094f1e1b3ca1cc331248aa046e" + }, + { + "split": "post_processing", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "artifact_sha256": "79941d9c6ad58407df842aad041f49cd94a539db73fa4fe7aa48df62162c73cd" + }, + { + "split": "noisy", + "total_queries": 200, + "correct": 200, + "accuracy": 1.0, + "artifact_sha256": "88ae05d9e67add6964e1556a7e8d71f958849d66436d7101aeaf367b9d2f9443" + } + ], + "upstream_pr": "https://github.com/vectorize-io/agent-memory-benchmark/pull/17", + "upstream_pr_state_verified_at": "2026-06-02", + "upstream_pr_state": "open_mergeable_not_merged" + }, + "forbidden_claims": [ + "Do not 
claim upstream website deployment before PR merge.", + "Do not claim AGI from these benchmark results.", + "Do not use local replay scores as official judge evidence.", + "Do not expose API keys, private memory, raw user memory, or credentials.", + "Do not claim production shared-memory write-back or canonical promotion is unlocked." + ] +} + diff --git a/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt b/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt new file mode 100644 index 0000000..80135d3 --- /dev/null +++ b/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt @@ -0,0 +1,5 @@ +BB0B69DE6DC552E5EBC3B7D663CA7A807EA65B6FD5123BD2F33BF88A141BDF32 FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json +D0BE2618C1F8BF35E8CAAE9FADFA906D1C2625EF5CDFE449007419C2F2596446 FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md +EFF2DBF5281E4DDD0EC7B503ED305D45E3B538FC52F8221AA27475C8D71AD73C FuFront-LifeBrain-MEM/PUBLIC_REPORT.md +AEDFD858AC5AD3EF6D7782151D9EDBECAC751AFB58B85E58576E384D643FCAA4 FuFront-LifeBrain-MEM/README.md +3167624F467462D0DB5DF3C551561275E0333B840152CD9C49E42EF69B43737D FuFront-LifeBrain-MEM/REPRODUCTION.md diff --git a/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md b/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md new file mode 100644 index 0000000..6eba030 --- /dev/null +++ b/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md @@ -0,0 +1,69 @@ +# Staged Open-Source Plan + +## Verdict + +ALLOW staged open-source. + +BLOCK naked repository dump. + +## Why Staging Is Required + +The value is not just code. The value is the architecture discipline: + +- memory as ontology +- typed causal cards +- evidence and inference separation +- deterministic body solvers +- absence authority guard +- official judge parity +- anti-overclaim gates + +If released as a loose repo, the design can be misread as ordinary RAG or a +benchmark-specific patch collection. 
+ +## Stage 0: Evidence Freeze + +Freeze: + +- benchmark table +- artifact sha256 +- PR links +- command transcript +- limitation text +- no-secret audit + +## Stage 1: Reference Implementation + +Open: + +- card schema +- edge schema +- evidence guard +- typed solver examples +- benchmark harness adapter +- trace visualizer + +Do not open: + +- private memory banks +- raw user traces +- API keys +- production write-back config +- unreleased model weights +- benchmark-specific cleanup scripts without context + +## Stage 2: Multi-Benchmark Expansion + +Next public targets: + +1. Maintained LongMemEval-style leaderboard or report. +2. LoCoMo. +3. LifeBench. +4. PersonaMem. +5. Any active memory leaderboard with reproducible submission rules. + +## Stage 3: Full Public Package + +Publish a clean reference package only after the evidence and no-secret gates +are frozen. + diff --git a/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md b/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md new file mode 100644 index 0000000..1763a25 --- /dev/null +++ b/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md @@ -0,0 +1,68 @@ +# FuFront-LifeBrain-MEM Public Report + +## Title + +FuFront-LifeBrain-MEM: Causal Memory Bank with Local CoreBrain Reaches 100% on +LongMemEval S and MemSim + +## Abstract + +FuFront-LifeBrain-MEM is a memory-as-ontology system. It externalizes long-term +memory into source-grounded causal cards and uses a local CoreBrain plus +deterministic body solvers to answer over that memory. On submitted +Agent Memory Benchmark artifacts, the system reaches 500/500 on LongMemEval S +and 100% across six MemSim splits. + +These are benchmark-scoped results. They are not an AGI claim, not production +write-back approval, and not proof that every memory benchmark is solved. 
+ +## Results + +| Benchmark | Split | Score | Status | +| --- | --- | ---: | --- | +| LongMemEval | S | 500/500 | PR open, mergeable | +| MemSim | simple | 200/200 | PR open, mergeable | +| MemSim | conditional | 200/200 | PR open, mergeable | +| MemSim | comparative | 294/294 | PR open, mergeable | +| MemSim | aggregative | 275/275 | PR open, mergeable | +| MemSim | post_processing | 200/200 | PR open, mergeable | +| MemSim | noisy | 200/200 | PR open, mergeable | + +## Design Difference + +The winning path is not generic long-context recall. + +```text +question +-> target memory schema +-> CKB typed cards +-> real evidence guard +-> typed solver proof +-> deterministic answer composer +-> official judge +``` + +The key invariant is evidence authority: + +```text +real memory evidence > typed causal card > solver proof > composer +``` + +Solver proof is useful for deterministic reasoning, but it must not become +evidence. + +## Not Ordinary RAG + +Ordinary RAG retrieves text and asks a model to answer. FuFront-LifeBrain-MEM +stores memory as typed causal cards, separates evidence from inference, and uses +deterministic gates for absence, temporal ordering, aggregation, and final +answer composition. + +## Limitations + +- Upstream PRs are still pending merge. +- Public leaderboard deployment depends on upstream maintainers. +- Current evidence is strongest for LongMemEval S and MemSim. +- Other memory benchmarks require separate official evidence. +- Local replay scores must not be substituted for official judge scores. + diff --git a/FuFront-LifeBrain-MEM/README.md b/FuFront-LifeBrain-MEM/README.md new file mode 100644 index 0000000..a694f16 --- /dev/null +++ b/FuFront-LifeBrain-MEM/README.md @@ -0,0 +1,86 @@ +# FuFront-LifeBrain-MEM + +Public evidence folder for Fufront-RyanX / LifeBrain memory benchmark results. + +This folder is intentionally scoped. It contains public benchmark evidence, +reproduction boundaries, and open-source staging notes. 
It does not contain +private memory, API keys, raw user data, unreleased model weights, or production +write-back configuration. + +## Current Evidence + +### LongMemEval S + +- Run name: `Fufront-RyanX` +- Memory provider: `ckb` +- Answer path: `corebrain:ckb-body-v1` +- Judge: `openai:gpt-4o` +- Oracle: `false` +- Total queries: `500` +- Correct: `500` +- Accuracy: `100.0%` +- Artifact: `outputs/longmemeval/Fufront-RyanX/rag/s.json.gz` +- Artifact sha256: + `bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a` +- Submission PR: + https://github.com/vectorize-io/agent-memory-benchmark/pull/18 + +### MemSim + +- Run name: `Fufront-RyanX` +- Memory provider: `ckb` +- Oracle: `false` +- Submission PR: + https://github.com/vectorize-io/agent-memory-benchmark/pull/17 + +| Split | Correct | Accuracy | +| --- | ---: | ---: | +| simple | 200/200 | 100.0% | +| conditional | 200/200 | 100.0% | +| comparative | 294/294 | 100.0% | +| aggregative | 275/275 | 100.0% | +| post_processing | 200/200 | 100.0% | +| noisy | 200/200 | 100.0% | + +## Public Claim Boundary + +Safe claim: + +```text +Fufront-RyanX CKB reached 500/500 on LongMemEval S using a local CoreBrain +plus causal memory bank, with OpenAI GPT-4o used only as the official judge. +``` + +Unsafe claims: + +- Do not claim AGI from these benchmark results. +- Do not claim upstream leaderboard deployment before the PR is merged. +- Do not treat local replay scores as official judge evidence. +- Do not claim production shared-memory write-back is unlocked. +- Do not publish private memory, raw traces, API keys, or credentials. + +## Architecture Summary + +FuFront-LifeBrain-MEM is not ordinary RAG. The intended architecture is: + +```text +question +-> target memory schema +-> causal memory bank typed cards +-> real evidence guard +-> typed solver proof +-> deterministic answer composer +-> official judge +``` + +The core invariant is that solver-generated intermediate objects are not +evidence. 
Evidence must come from source-grounded memory cards. + +## Read Order + +1. `EVIDENCE_PACKET.json` +2. `PUBLIC_REPORT.md` +3. `REPRODUCTION.md` +4. `OPEN_SOURCE_PLAN.md` +5. `MANIFEST_SHA256.txt` + diff --git a/FuFront-LifeBrain-MEM/REPRODUCTION.md b/FuFront-LifeBrain-MEM/REPRODUCTION.md new file mode 100644 index 0000000..9f3e530 --- /dev/null +++ b/FuFront-LifeBrain-MEM/REPRODUCTION.md @@ -0,0 +1,46 @@ +# Reproduction and Verification Notes + +## Current Submission PRs + +- LongMemEval: + https://github.com/vectorize-io/agent-memory-benchmark/pull/18 +- MemSim: + https://github.com/vectorize-io/agent-memory-benchmark/pull/17 + +Both were verified on 2026-06-02 as open, mergeable, and not merged. + +## Manifest Files + +The public benchmark entries are recorded in: + +- `results-manifest.json` +- `blob-manifest.json` +- `.blob_manifest.json` + +LongMemEval artifact: + +```text +outputs/longmemeval/Fufront-RyanX/rag/s.json.gz +sha256: bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a +``` + +## Official vs Local Boundary + +Use official judge artifacts for public claims. Local replay is useful for +debugging typed solvers and composer behavior, but it is not equivalent to +official OpenAI judge scoring. + +## What Should Be Added Before a Standalone Paper or Repo + +1. Full official command transcript. +2. Environment snapshot. +3. Warm local answer latency. +4. Official end-to-end latency. +5. Per-question trace with secrets and private memory removed. +6. 
Ablation table: + - retrieval baseline + - CKB typed cards + - typed solvers + - absence guard + - deterministic composer + From 868a3252ddda8752146fa36595cc27368160d721 Mon Sep 17 00:00:00 2001 From: Fufront-RyanX Date: Tue, 2 Jun 2026 17:47:04 +0800 Subject: [PATCH 3/3] docs: make FuFront-LifeBrain-MEM bilingual --- FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json | 17 ++++- FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt | 10 +-- FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md | 77 ++++++++++++++-------- FuFront-LifeBrain-MEM/PUBLIC_REPORT.md | 58 +++++++++++----- FuFront-LifeBrain-MEM/README.md | 71 +++++++++++++------- FuFront-LifeBrain-MEM/REPRODUCTION.md | 31 +++++++-- 6 files changed, 188 insertions(+), 76 deletions(-) diff --git a/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json b/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json index 75a11bf..59ba7dd 100644 --- a/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json +++ b/FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json @@ -1,13 +1,16 @@ { "artifact_type": "fufront_lifebrain_mem_public_evidence_packet", + "artifact_type_zh": "FuFront-LifeBrain-MEM 公開證據包", "created_at": "2026-06-02T17:14:24+08:00", "verdict": "ALLOW_AS_PUBLIC_EVIDENCE_FOLDER__UPSTREAM_DEPLOYMENT_PENDING", + "verdict_zh": "允許作為公開證據資料夾;upstream 部署仍等待合併", "system": { "public_name": "FuFront-LifeBrain-MEM", "run_brand": "Fufront-RyanX", "memory_provider": "ckb", "answer_path": "local_corebrain_plus_causal_memory_bank", - "architecture_claim": "memory_as_ontology" + "architecture_claim": "memory_as_ontology", + "architecture_claim_zh": "記憶即本體" }, "longmemeval": { "dataset": "longmemeval", @@ -25,7 +28,8 @@ "artifact_sha256": "bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a", "upstream_pr": "https://github.com/vectorize-io/agent-memory-benchmark/pull/18", "upstream_pr_state_verified_at": "2026-06-02", - "upstream_pr_state": "open_mergeable_not_merged" + "upstream_pr_state": "open_mergeable_not_merged", + "note_zh": "LongMemEval S 使用本地核心小腦 + 因果記憶庫回答;OpenAI GPT-4o 僅作為官方裁判。" }, 
"memsim": { "dataset": "memsim", @@ -33,6 +37,7 @@ "mode": "rag", "memory_provider": "ckb", "oracle": false, + "note_zh": "MemSim 六個 split 全部 100%。", "splits": [ { "split": "simple", @@ -87,6 +92,12 @@ "Do not use local replay scores as official judge evidence.", "Do not expose API keys, private memory, raw user memory, or credentials.", "Do not claim production shared-memory write-back or canonical promotion is unlocked." + ], + "forbidden_claims_zh": [ + "PR 合併前不要宣稱 upstream 官網已部署。", + "不要用這些 benchmark 結果宣稱 AGI。", + "不要把本地 replay 分數當成 official judge 證據。", + "不要暴露 API key、私有記憶、原始使用者記憶或憑證。", + "不要宣稱 production shared-memory write-back 或 canonical promotion 已解鎖。" ] } - diff --git a/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt b/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt index 80135d3..49e4157 100644 --- a/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt +++ b/FuFront-LifeBrain-MEM/MANIFEST_SHA256.txt @@ -1,5 +1,5 @@ -BB0B69DE6DC552E5EBC3B7D663CA7A807EA65B6FD5123BD2F33BF88A141BDF32 FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json -D0BE2618C1F8BF35E8CAAE9FADFA906D1C2625EF5CDFE449007419C2F2596446 FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md -EFF2DBF5281E4DDD0EC7B503ED305D45E3B538FC52F8221AA27475C8D71AD73C FuFront-LifeBrain-MEM/PUBLIC_REPORT.md -AEDFD858AC5AD3EF6D7782151D9EDBECAC751AFB58B85E58576E384D643FCAA4 FuFront-LifeBrain-MEM/README.md -3167624F467462D0DB5DF3C551561275E0333B840152CD9C49E42EF69B43737D FuFront-LifeBrain-MEM/REPRODUCTION.md +10E3E5E3CC2CD847AFFF14B1B156987B26E63A37AB2A0B5241EEB08DA24B9BF3 FuFront-LifeBrain-MEM/EVIDENCE_PACKET.json +9D0CD4FC6A479AA9235330BBA2A27A1F247E7FF337CA6B5655D4D2E0366D65CA FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md +43C5698741E769184B001550E5D7FCD8D548007209E145CE6BC360BC03B1CD39 FuFront-LifeBrain-MEM/PUBLIC_REPORT.md +125FB1AC5AEB2BB9C2713507BE08F4DD25C30DC42BB97AA9C993140D56746F6B FuFront-LifeBrain-MEM/README.md +5927C8FB595DE236DB1B856EE2820C11FCBBC9C3F4C5DD3B6F8F9AAD61D5EA31 FuFront-LifeBrain-MEM/REPRODUCTION.md diff --git 
a/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md b/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md index 6eba030..ff6a854 100644 --- a/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md +++ b/FuFront-LifeBrain-MEM/OPEN_SOURCE_PLAN.md @@ -1,41 +1,53 @@ -# Staged Open-Source Plan +# Staged Open-Source Plan / 分階段開源計畫 -## Verdict +## Verdict / 裁決 ALLOW staged open-source. +允許分階段開源。 + BLOCK naked repository dump. -## Why Staging Is Required +禁止裸開源整包亂丟。 + +## Why Staging Is Required / 為什麼必須分階段 The value is not just code. The value is the architecture discipline: -- memory as ontology -- typed causal cards -- evidence and inference separation -- deterministic body solvers -- absence authority guard -- official judge parity -- anti-overclaim gates +價值不只是代碼,而是整套架構紀律: + +- memory as ontology / 記憶即本體 +- typed causal cards / typed causal cards +- evidence and inference separation / evidence 與 inference 分離 +- deterministic body solvers / deterministic body solvers +- absence authority guard / absence authority guard +- official judge parity / official judge 對齊 +- anti-overclaim gates / 反過度宣稱 gate If released as a loose repo, the design can be misread as ordinary RAG or a benchmark-specific patch collection. 
-## Stage 0: Evidence Freeze +如果鬆散地開源,這套設計很容易被誤讀成普通 RAG 或 benchmark-specific patch collection。 + +## Stage 0: Evidence Freeze / 階段 0:證據凍結 Freeze: -- benchmark table -- artifact sha256 -- PR links -- command transcript -- limitation text -- no-secret audit +凍結: + +- benchmark table / benchmark 表格 +- artifact sha256 / artifact sha256 +- PR links / PR 連結 +- command transcript / command transcript +- limitation text / 限制聲明 +- no-secret audit / 無 secrets 審計 -## Stage 1: Reference Implementation +## Stage 1: Reference Implementation / 階段 1:參考實作 Open: +可公開: + - card schema - edge schema - evidence guard @@ -45,25 +57,38 @@ Open: Do not open: -- private memory banks -- raw user traces -- API keys -- production write-back config -- unreleased model weights -- benchmark-specific cleanup scripts without context +暫不公開: + +- private memory banks / 私有記憶庫 +- raw user traces / 原始使用者 trace +- API keys / API keys +- production write-back config / production write-back 設定 +- unreleased model weights / 未公開模型權重 +- benchmark-specific cleanup scripts without context / 沒有上下文的 benchmark-specific cleanup scripts -## Stage 2: Multi-Benchmark Expansion +## Stage 2: Multi-Benchmark Expansion / 階段 2:多 benchmark 擴展 Next public targets: +下一批公開目標: + 1. Maintained LongMemEval-style leaderboard or report. 2. LoCoMo. 3. LifeBench. 4. PersonaMem. 5. Any active memory leaderboard with reproducible submission rules. -## Stage 3: Full Public Package +中文: + +1. 有維護的 LongMemEval 類 leaderboard 或 report。 +2. LoCoMo。 +3. LifeBench。 +4. PersonaMem。 +5. 任何有清楚提交規則、可重現的活躍記憶排行榜。 + +## Stage 3: Full Public Package / 階段 3:完整公開包 Publish a clean reference package only after the evidence and no-secret gates are frozen. 
+只有在 evidence gate 與 no-secret gate 凍結後,才發布乾淨的 reference package。 diff --git a/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md b/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md index 1763a25..6c930df 100644 --- a/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md +++ b/FuFront-LifeBrain-MEM/PUBLIC_REPORT.md @@ -1,11 +1,13 @@ -# FuFront-LifeBrain-MEM Public Report +# FuFront-LifeBrain-MEM Public Report / 公開報告 -## Title +## Title / 標題 FuFront-LifeBrain-MEM: Causal Memory Bank with Local CoreBrain Reaches 100% on LongMemEval S and MemSim -## Abstract +FuFront-LifeBrain-MEM:本地核心小腦 + 因果記憶庫在 LongMemEval S 與 MemSim 達到 100% + +## Abstract / 摘要 FuFront-LifeBrain-MEM is a memory-as-ontology system. It externalizes long-term memory into source-grounded causal cards and uses a local CoreBrain plus @@ -13,25 +15,31 @@ deterministic body solvers to answer over that memory. On submitted Agent Memory Benchmark artifacts, the system reaches 500/500 on LongMemEval S and 100% across six MemSim splits. +FuFront-LifeBrain-MEM 是一套 memory-as-ontology 系統。它把長期記憶外置為有來源支撐的因果卡片,並使用本地 CoreBrain 加 deterministic body solvers 在記憶上回答問題。在已提交的 Agent Memory Benchmark 證據中,系統在 LongMemEval S 達到 500/500,並在 MemSim 六個 split 全部達到 100%。 + These are benchmark-scoped results. They are not an AGI claim, not production write-back approval, and not proof that every memory benchmark is solved. 
-## Results +這些結果只限於 benchmark 證據邊界內。它們不是 AGI 宣稱,不是 production write-back 授權,也不是所有記憶 benchmark 都已解決的證明。 + +## Results / 結果 | Benchmark | Split | Score | Status | | --- | --- | ---: | --- | -| LongMemEval | S | 500/500 | PR open, mergeable | -| MemSim | simple | 200/200 | PR open, mergeable | -| MemSim | conditional | 200/200 | PR open, mergeable | -| MemSim | comparative | 294/294 | PR open, mergeable | -| MemSim | aggregative | 275/275 | PR open, mergeable | -| MemSim | post_processing | 200/200 | PR open, mergeable | -| MemSim | noisy | 200/200 | PR open, mergeable | +| LongMemEval | S | 500/500 | PR open, mergeable / PR 已開、可合併 | +| MemSim | simple | 200/200 | PR open, mergeable / PR 已開、可合併 | +| MemSim | conditional | 200/200 | PR open, mergeable / PR 已開、可合併 | +| MemSim | comparative | 294/294 | PR open, mergeable / PR 已開、可合併 | +| MemSim | aggregative | 275/275 | PR open, mergeable / PR 已開、可合併 | +| MemSim | post_processing | 200/200 | PR open, mergeable / PR 已開、可合併 | +| MemSim | noisy | 200/200 | PR open, mergeable / PR 已開、可合併 | -## Design Difference +## Design Difference / 設計差異 The winning path is not generic long-context recall. +成功路徑不是泛用長上下文回憶。 + ```text question -> target memory schema @@ -40,29 +48,49 @@ question -> typed solver proof -> deterministic answer composer -> official judge + +問題 +-> 目標記憶 schema +-> CKB typed cards +-> 真實證據守門 +-> typed solver proof +-> deterministic answer composer +-> official judge ``` The key invariant is evidence authority: +關鍵不變量是證據權限: + ```text real memory evidence > typed causal card > solver proof > composer + +真實記憶證據 > typed causal card > solver proof > composer ``` Solver proof is useful for deterministic reasoning, but it must not become evidence. -## Not Ordinary RAG +Solver proof 對 deterministic reasoning 有用,但它不能變成 evidence。 + +## Not Ordinary RAG / 不是普通 RAG Ordinary RAG retrieves text and asks a model to answer. 
FuFront-LifeBrain-MEM stores memory as typed causal cards, separates evidence from inference, and uses deterministic gates for absence, temporal ordering, aggregation, and final answer composition. -## Limitations +普通 RAG 通常是檢索文本再讓模型回答。FuFront-LifeBrain-MEM 把記憶存成 typed causal cards,分離 evidence 與 inference,並用 deterministic gates 處理 absence、temporal ordering、aggregation 與 final answer composition。 + +## Limitations / 限制 - Upstream PRs are still pending merge. +- upstream PR 仍等待合併。 - Public leaderboard deployment depends on upstream maintainers. +- 官方榜單部署取決於 upstream 維護者。 - Current evidence is strongest for LongMemEval S and MemSim. +- 目前最強證據集中在 LongMemEval S 與 MemSim。 - Other memory benchmarks require separate official evidence. +- 其他記憶 benchmark 需要獨立 official evidence。 - Local replay scores must not be substituted for official judge scores. - +- 本地 replay 分數不能替代 official judge 分數。 diff --git a/FuFront-LifeBrain-MEM/README.md b/FuFront-LifeBrain-MEM/README.md index a694f16..29fd4a7 100644 --- a/FuFront-LifeBrain-MEM/README.md +++ b/FuFront-LifeBrain-MEM/README.md @@ -2,35 +2,39 @@ Public evidence folder for Fufront-RyanX / LifeBrain memory benchmark results. +FuFront-LifeBrain-MEM 是 Fufront-RyanX / LifeBrain 記憶基準測試的公開證據資料夾。 + This folder is intentionally scoped. It contains public benchmark evidence, reproduction boundaries, and open-source staging notes. It does not contain private memory, API keys, raw user data, unreleased model weights, or production write-back configuration. 
-## Current Evidence +這個資料夾刻意保持邊界清楚:只放公開 benchmark 證據、重現邊界與分階段開源說明。不包含私有記憶、API key、原始使用者資料、未公開模型權重或 production write-back 設定。 + +## Current Evidence / 目前證據 ### LongMemEval S -- Run name: `Fufront-RyanX` -- Memory provider: `ckb` -- Answer path: `corebrain:ckb-body-v1` -- Judge: `openai:gpt-4o` -- Oracle: `false` -- Total queries: `500` -- Correct: `500` -- Accuracy: `100.0%` -- Artifact: `outputs/longmemeval/Fufront-RyanX/rag/s.json.gz` +- Run name / 運行名稱: `Fufront-RyanX` +- Memory provider / 記憶系統: `ckb` +- Answer path / 回答路徑: `corebrain:ckb-body-v1` +- Judge / 裁判: `openai:gpt-4o` +- Oracle / 是否 oracle: `false` +- Total queries / 題數: `500` +- Correct / 正確: `500` +- Accuracy / 準確率: `100.0%` +- Artifact / 結果檔: `outputs/longmemeval/Fufront-RyanX/rag/s.json.gz` - Artifact sha256: `bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a` -- Submission PR: +- Submission PR / 提交 PR: https://github.com/vectorize-io/agent-memory-benchmark/pull/18 ### MemSim -- Run name: `Fufront-RyanX` -- Memory provider: `ckb` -- Oracle: `false` -- Submission PR: +- Run name / 運行名稱: `Fufront-RyanX` +- Memory provider / 記憶系統: `ckb` +- Oracle / 是否 oracle: `false` +- Submission PR / 提交 PR: https://github.com/vectorize-io/agent-memory-benchmark/pull/17 | Split | Correct | Accuracy | @@ -42,26 +46,40 @@ write-back configuration. | post_processing | 200/200 | 100.0% | | noisy | 200/200 | 100.0% | -## Public Claim Boundary +## Public Claim Boundary / 對外宣稱邊界 -Safe claim: +Safe claim / 安全宣稱: ```text Fufront-RyanX CKB reached 500/500 on LongMemEval S using a local CoreBrain plus causal memory bank, with OpenAI GPT-4o used only as the official judge. + +Fufront-RyanX CKB 使用本地核心小腦 + 因果記憶庫,在 LongMemEval S 達到 +500/500;OpenAI GPT-4o 僅作為官方裁判使用。 ``` -Unsafe claims: +Unsafe claims / 不安全宣稱: - Do not claim AGI from these benchmark results. +- 不要用這些 benchmark 結果宣稱 AGI。 - Do not claim upstream leaderboard deployment before the PR is merged. 
+- upstream PR 合併前,不要宣稱官方榜單已部署。 - Do not treat local replay scores as official judge evidence. +- 不要把本地 replay 分數當成 official judge 證據。 - Do not claim production shared-memory write-back is unlocked. +- 不要宣稱 production shared-memory write-back 已解鎖。 - Do not publish private memory, raw traces, API keys, or credentials. +- 不要發布私有記憶、原始 trace、API key 或憑證。 + +## Architecture Summary / 架構摘要 + +FuFront-LifeBrain-MEM is not ordinary RAG. + +FuFront-LifeBrain-MEM 不是普通 RAG。 -## Architecture Summary +The intended architecture is: -FuFront-LifeBrain-MEM is not ordinary RAG. The intended architecture is: +目標架構是: ```text question @@ -71,16 +89,25 @@ question -> typed solver proof -> deterministic answer composer -> official judge + +問題 +-> 目標記憶 schema +-> 因果記憶庫 typed cards +-> 真實證據守門 +-> typed solver proof +-> deterministic answer composer +-> official judge ``` The core invariant is that solver-generated intermediate objects are not evidence. Evidence must come from source-grounded memory cards. -## Read Order +核心不變量:solver 生成的中間物不能反過來當 evidence。證據必須來自有來源支撐的記憶卡片。 + +## Read Order / 閱讀順序 1. `EVIDENCE_PACKET.json` 2. `PUBLIC_REPORT.md` 3. `REPRODUCTION.md` 4. `OPEN_SOURCE_PLAN.md` 5. `MANIFEST_SHA256.txt` - diff --git a/FuFront-LifeBrain-MEM/REPRODUCTION.md b/FuFront-LifeBrain-MEM/REPRODUCTION.md index 9f3e530..206cf8a 100644 --- a/FuFront-LifeBrain-MEM/REPRODUCTION.md +++ b/FuFront-LifeBrain-MEM/REPRODUCTION.md @@ -1,6 +1,6 @@ -# Reproduction and Verification Notes +# Reproduction and Verification Notes / 重現與驗證說明 -## Current Submission PRs +## Current Submission PRs / 目前提交 PR - LongMemEval: https://github.com/vectorize-io/agent-memory-benchmark/pull/18 @@ -9,29 +9,37 @@ Both were verified on 2026-06-02 as open, mergeable, and not merged. 
-## Manifest Files +兩個 PR 在 2026-06-02 驗證為 open、mergeable、not merged。 + +## Manifest Files / Manifest 檔案 The public benchmark entries are recorded in: +公開 benchmark 條目記錄於: + - `results-manifest.json` - `blob-manifest.json` - `.blob_manifest.json` -LongMemEval artifact: +LongMemEval artifact / LongMemEval 結果檔: ```text outputs/longmemeval/Fufront-RyanX/rag/s.json.gz sha256: bc692b10877d44a8669bbd1c10eef09ae333530c06235217170389820497ef1a ``` -## Official vs Local Boundary +## Official vs Local Boundary / 官方與本地邊界 Use official judge artifacts for public claims. Local replay is useful for debugging typed solvers and composer behavior, but it is not equivalent to official OpenAI judge scoring. +對外宣稱必須使用 official judge artifact。本地 replay 適合用來 debug typed solvers 與 composer 行為,但不等同於 official OpenAI judge scoring。 + ## What Should Be Added Before a Standalone Paper or Repo +獨立論文或 repo 前應補齊的證據: + 1. Full official command transcript. 2. Environment snapshot. 3. Warm local answer latency. @@ -44,3 +52,16 @@ official OpenAI judge scoring. - absence guard - deterministic composer +中文: + +1. 完整 official command transcript。 +2. 環境快照。 +3. warm local answer latency。 +4. official end-to-end latency。 +5. 已移除 secrets 與 private memory 的 per-question trace。 +6. Ablation 表格: + - retrieval baseline + - CKB typed cards + - typed solvers + - absence guard + - deterministic composer