diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9a289d9..d215db8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,36 +1,62 @@ -name: CI - -on: - pull_request: - push: - branches: - - main - -jobs: - test-and-build: - runs-on: ubuntu-latest - - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Set up Node.js - uses: actions/setup-node@v4 - with: - node-version: 22 - cache: npm - - - name: Install dependencies - run: npm ci - - - name: Install Playwright browser - run: npx playwright install --with-deps chromium - - - name: Run unit tests - run: npm run test - - - name: Run accessibility E2E tests - run: npm run test:e2e - - - name: Build site - run: npm run build +name: CI + +on: + push: + branches: ['**'] + pull_request: + branches: [main] + +jobs: + build-and-test: + name: Build & Unit Tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: '22' + cache: 'npm' + - run: npm ci + - run: npm run typecheck + - run: npm run build + - run: npm test + - uses: actions/upload-artifact@v4 + if: always() + with: + name: dist + path: dist/ + retention-days: 1 + + e2e: + name: Playwright E2E + runs-on: ubuntu-latest + needs: build-and-test + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: '22' + cache: 'npm' + - run: npm ci + - name: Cache Playwright browsers + uses: actions/cache@v4 + id: pw-cache + with: + path: ~/.cache/ms-playwright + key: pw-${{ runner.os }}-${{ hashFiles('package-lock.json') }} + - name: Install Playwright browsers + if: steps.pw-cache.outputs.cache-hit != 'true' + run: npx playwright install --with-deps chromium + - name: Install system deps (cache hit path) + if: steps.pw-cache.outputs.cache-hit == 'true' + run: npx playwright install-deps chromium + - run: npm run build + - run: npm run test:e2e + env: + CI: 'true' + - uses: actions/upload-artifact@v4 + if: 
failure() + with: + name: playwright-report + path: playwright-report/ + retention-days: 7 diff --git a/package-lock.json b/package-lock.json index 4032be8..f724a95 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,18 +9,74 @@ "version": "0.0.0", "dependencies": { "heic2any": "^0.0.4", - "jszip": "^3.10.1" + "jszip": "^3.10.1", + "tesseract.js": "^5.1.1", + "xlsx": "https://cdn.sheetjs.com/xlsx-latest/xlsx-latest.tgz" }, "devDependencies": { "@axe-core/playwright": "^4.11.3", "@playwright/test": "^1.60.0", + "@types/papaparse": "^5.5.2", "axe-core": "^4.11.4", + "jsdom": "^29.1.1", + "papaparse": "^5.5.3", "playwright": "^1.60.0", "typescript": "~6.0.2", "vite": "^8.0.12", "vitest": "^4.1.6" } }, + "node_modules/@asamuzakjp/css-color": { + "version": "5.1.11", + "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-5.1.11.tgz", + "integrity": "sha512-KVw6qIiCTUQhByfTd78h2yD1/00waTmm9uy/R7Ck/ctUyAPj+AEDLkQIdJW0T8+qGgj3j5bpNKK7Q3G+LedJWg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@asamuzakjp/generational-cache": "^1.0.1", + "@csstools/css-calc": "^3.2.0", + "@csstools/css-color-parser": "^4.1.0", + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/dom-selector": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/@asamuzakjp/dom-selector/-/dom-selector-7.1.1.tgz", + "integrity": "sha512-67RZDnYRc8H/8MLDgQCDE//zoqVFwajkepHZgmXrbwybzXOEwOWGPYGmALYl9J2DOLfFPPs6kKCqmbzV895hTQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@asamuzakjp/generational-cache": "^1.0.1", + "@asamuzakjp/nwsapi": "^2.3.9", + "bidi-js": "^1.0.3", + "css-tree": "^3.2.1", + "is-potential-custom-element-name": "^1.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/generational-cache": { + "version": "1.0.1", + "resolved": 
"https://registry.npmjs.org/@asamuzakjp/generational-cache/-/generational-cache-1.0.1.tgz", + "integrity": "sha512-wajfB8KqzMCN2KGNFdLkReeHncd0AslUSrvHVvvYWuU8ghncRJoA50kT3zP9MVL0+9g4/67H+cdvBskj9THPzg==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/@asamuzakjp/nwsapi": { + "version": "2.3.9", + "resolved": "https://registry.npmjs.org/@asamuzakjp/nwsapi/-/nwsapi-2.3.9.tgz", + "integrity": "sha512-n8GuYSrI9bF7FFZ/SjhwevlHc8xaVlb/7HmHelnc/PZXBD2ZR49NnN9sMMuDdEGPeeRQ5d0hqlSlEpgCX3Wl0Q==", + "dev": true, + "license": "MIT" + }, "node_modules/@axe-core/playwright": { "version": "4.11.3", "resolved": "https://registry.npmjs.org/@axe-core/playwright/-/playwright-4.11.3.tgz", @@ -34,6 +90,159 @@ "playwright-core": ">= 1.0.0" } }, + "node_modules/@bramus/specificity": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/@bramus/specificity/-/specificity-2.4.2.tgz", + "integrity": "sha512-ctxtJ/eA+t+6q2++vj5j7FYX3nRu311q1wfYH3xjlLOsczhlhxAg2FWNUXhpGvAw3BWo1xBcvOV6/YLc2r5FJw==", + "dev": true, + "license": "MIT", + "dependencies": { + "css-tree": "^3.0.0" + }, + "bin": { + "specificity": "bin/cli.js" + } + }, + "node_modules/@csstools/color-helpers": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/@csstools/color-helpers/-/color-helpers-6.0.2.tgz", + "integrity": "sha512-LMGQLS9EuADloEFkcTBR3BwV/CGHV7zyDxVRtVDTwdI2Ca4it0CCVTT9wCkxSgokjE5Ho41hEPgb8OEUwoXr6Q==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "engines": { + "node": ">=20.19.0" + } + }, + "node_modules/@csstools/css-calc": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/@csstools/css-calc/-/css-calc-3.2.1.tgz", + "integrity": 
"sha512-DtdHlgXh5ZkA43cwBcAm+huzgJiwx3ZTWVjBs94kwz2xKqSimDA3lBgCjphYgwgVUMWatSM0pDd8TILB1yrVVg==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-color-parser": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/@csstools/css-color-parser/-/css-color-parser-4.1.1.tgz", + "integrity": "sha512-eZ5XOtyhK+mggRafYUWzA0tvaYOFgdY8AkgQiCJF9qNAePnUo/zmsqqYubBBb3sQ8uNUaSKTY9s9klfRaAXL0g==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "dependencies": { + "@csstools/color-helpers": "^6.0.2", + "@csstools/css-calc": "^3.2.1" + }, + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^4.0.0", + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-parser-algorithms": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@csstools/css-parser-algorithms/-/css-parser-algorithms-4.0.0.tgz", + "integrity": "sha512-+B87qS7fIG3L5h3qwJ/IFbjoVoOe/bpOdh9hAjXbvx0o8ImEmUsGXN0inFOnk2ChCFgqkkGFQ+TpM5rbhkKe4w==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + }, + "peerDependencies": { + "@csstools/css-tokenizer": "^4.0.0" + } + }, + "node_modules/@csstools/css-syntax-patches-for-csstree": { + "version": "1.1.4", + "resolved": 
"https://registry.npmjs.org/@csstools/css-syntax-patches-for-csstree/-/css-syntax-patches-for-csstree-1.1.4.tgz", + "integrity": "sha512-wgsqt92b7C7tQhIdPNxj0n9zuUbQlvAuI1exyzeNrOKOi62SD7ren8zqszmpVREjAOqg8cD2FqYhQfAuKjk4sw==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT-0", + "peerDependencies": { + "css-tree": "^3.2.1" + }, + "peerDependenciesMeta": { + "css-tree": { + "optional": true + } + } + }, + "node_modules/@csstools/css-tokenizer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/@csstools/css-tokenizer/-/css-tokenizer-4.0.0.tgz", + "integrity": "sha512-QxULHAm7cNu72w97JUNCBFODFaXpbDg+dP8b/oWFAZ2MTRppA3U00Y2L1HqaS4J6yBqxwa/Y3nMBaxVKbB/NsA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "license": "MIT", + "engines": { + "node": ">=20.19.0" + } + }, "node_modules/@emnapi/core": { "version": "1.10.0", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz", @@ -68,6 +277,24 @@ "tslib": "^2.4.0" } }, + "node_modules/@exodus/bytes": { + "version": "1.15.1", + "resolved": "https://registry.npmjs.org/@exodus/bytes/-/bytes-1.15.1.tgz", + "integrity": "sha512-S6mL0yNB/Abt9Ei4tq8gDhcczc4S3+vQ4ra7vxnAf+YHC02srtqxKKZghx2Dq6p0e66THKwR6r8N6P95wEty7Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + }, + "peerDependencies": { + "@noble/hashes": "^1.8.0 || ^2.0.0" + }, + "peerDependenciesMeta": { + "@noble/hashes": { + "optional": true + } + } + }, "node_modules/@jridgewell/sourcemap-codec": { "version": "1.5.5", "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", @@ -427,6 +654,26 @@ "dev": true, "license": "MIT" }, + 
"node_modules/@types/node": { + "version": "25.9.1", + "resolved": "https://registry.npmjs.org/@types/node/-/node-25.9.1.tgz", + "integrity": "sha512-xfrlY7UD5rMJk3ZVJP8BNzS28J36YJg+xp+LPXV1TdWxr8uMH5A860QNxYDGQe/ylDSgjxE52Q9VnO7p75tJxg==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": ">=7.24.0 <7.24.7" + } + }, + "node_modules/@types/papaparse": { + "version": "5.5.2", + "resolved": "https://registry.npmjs.org/@types/papaparse/-/papaparse-5.5.2.tgz", + "integrity": "sha512-gFnFp/JMzLHCwRf7tQHrNnfhN4eYBVYYI897CGX4MY1tzY9l2aLkVyx2IlKZ/SAqDbB3I1AOZW5gTMGGsqWliA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@vitest/expect": { "version": "4.1.6", "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-4.1.6.tgz", @@ -560,6 +807,22 @@ "node": ">=4" } }, + "node_modules/bidi-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/bidi-js/-/bidi-js-1.0.3.tgz", + "integrity": "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==", + "dev": true, + "license": "MIT", + "dependencies": { + "require-from-string": "^2.0.2" + } + }, + "node_modules/bmp-js": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/bmp-js/-/bmp-js-0.1.0.tgz", + "integrity": "sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==", + "license": "MIT" + }, "node_modules/chai": { "version": "6.2.2", "resolved": "https://registry.npmjs.org/chai/-/chai-6.2.2.tgz", @@ -583,6 +846,79 @@ "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", "license": "MIT" }, + "node_modules/css-tree": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/css-tree/-/css-tree-3.2.1.tgz", + "integrity": "sha512-X7sjQzceUhu1u7Y/ylrRZFU2FS6LRiFVp6rKLPg23y3x3c3DOKAwuXGDp+PAGjh6CSnCjYeAul8pcT8bAl+lSA==", + "dev": true, + "license": "MIT", + "dependencies": { + 
"mdn-data": "2.27.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0" + } + }, + "node_modules/data-urls": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-7.0.0.tgz", + "integrity": "sha512-23XHcCF+coGYevirZceTVD7NdJOqVn+49IHyxgszm+JIiHLoB2TkmPtsYkNWT1pvRSGkc35L6NHs0yHkN2SumA==", + "dev": true, + "license": "MIT", + "dependencies": { + "whatwg-mimetype": "^5.0.0", + "whatwg-url": "^16.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/data-urls/node_modules/tr46": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-6.0.0.tgz", + "integrity": "sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==", + "dev": true, + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/data-urls/node_modules/webidl-conversions": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz", + "integrity": "sha512-BMhLD/Sw+GbJC21C/UgyaZX41nPt8bUTg+jWyDeg7e7YN4xOM05YPSIXceACnXVtqyEw/LMClUQMtMZ+PGGpqQ==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=20" + } + }, + "node_modules/data-urls/node_modules/whatwg-url": { + "version": "16.0.1", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-16.0.1.tgz", + "integrity": "sha512-1to4zXBxmXHV3IiSSEInrreIlu02vUOvrhxJJH5vcxYTBDAx51cqZiKdyTxlecdKNSjj8EcxGBxNf6Vg+945gw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.11.0", + "tr46": "^6.0.0", + "webidl-conversions": "^8.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/decimal.js": { + "version": "10.6.0", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.6.0.tgz", + "integrity": 
"sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==", + "dev": true, + "license": "MIT" + }, "node_modules/detect-libc": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", @@ -593,6 +929,19 @@ "node": ">=8" } }, + "node_modules/entities": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-8.0.0.tgz", + "integrity": "sha512-zwfzJecQ/Uej6tusMqwAqU/6KL2XaB2VZ2Jg54Je6ahNBGNH6Ek6g3jjNCF0fG9EWQKGZNddNjU5F1ZQn/sBnA==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=20.19.0" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, "node_modules/es-module-lexer": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-2.1.0.tgz", @@ -659,6 +1008,25 @@ "integrity": "sha512-3lLnZiDELfabVH87htnRolZ2iehX9zwpRyGNz22GKXIu0fznlblf0/ftppXKNqS26dqFSeqfIBhAmAj/uSp0cA==", "license": "MIT" }, + "node_modules/html-encoding-sniffer": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-6.0.0.tgz", + "integrity": "sha512-CV9TW3Y3f8/wT0BRFc1/KAVQ3TUHiXmaAb6VW9vtiMFf7SLoMd1PdAc4W3KFOFETBJUb90KatHqlsZMWV+R9Gg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.6.0" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, + "node_modules/idb-keyval": { + "version": "6.2.4", + "resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-6.2.4.tgz", + "integrity": "sha512-D/NzHWUmYJGXi++z67aMSrnisb9A3621CyRK5G89JyTlN13C8xf0g04DLxUKMufPem3e3L2JAXR6Z00OWy183Q==", + "license": "Apache-2.0" + }, "node_modules/immediate": { "version": "3.0.6", "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", @@ -671,12 +1039,110 @@ "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", "license": "ISC" }, + 
"node_modules/is-electron": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/is-electron/-/is-electron-2.2.2.tgz", + "integrity": "sha512-FO/Rhvz5tuw4MCWkpMzHFKWD2LsfHzIb7i6MdPYZ/KW7AlxawyLkqdy+jPZP1WubqEADE3O4FUENlJHDfQASRg==", + "license": "MIT" + }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/is-url": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz", + "integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==", + "license": "MIT" + }, "node_modules/isarray": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", "license": "MIT" }, + "node_modules/jsdom": { + "version": "29.1.1", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-29.1.1.tgz", + "integrity": "sha512-ECi4Fi2f7BdJtUKTflYRTiaMxIB0O6zfR1fX0GXpUrf6flp8QIYn1UT20YQqdSOfk2dfkCwS8LAFoJDEppNK5Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@asamuzakjp/css-color": "^5.1.11", + "@asamuzakjp/dom-selector": "^7.1.1", + "@bramus/specificity": "^2.4.2", + "@csstools/css-syntax-patches-for-csstree": "^1.1.3", + "@exodus/bytes": "^1.15.0", + "css-tree": "^3.2.1", + "data-urls": "^7.0.0", + "decimal.js": "^10.6.0", + "html-encoding-sniffer": "^6.0.0", + "is-potential-custom-element-name": "^1.0.1", + "lru-cache": "^11.3.5", + "parse5": "^8.0.1", + "saxes": "^6.0.0", + "symbol-tree": "^3.2.4", + "tough-cookie": "^6.0.1", + "undici": "^7.25.0", + "w3c-xmlserializer": "^5.0.0", + "webidl-conversions": "^8.0.1", + 
"whatwg-mimetype": "^5.0.0", + "whatwg-url": "^16.0.1", + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": "^20.19.0 || ^22.13.0 || >=24.0.0" + }, + "peerDependencies": { + "canvas": "^3.0.0" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/jsdom/node_modules/tr46": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-6.0.0.tgz", + "integrity": "sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==", + "dev": true, + "license": "MIT", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=20" + } + }, + "node_modules/jsdom/node_modules/webidl-conversions": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.1.tgz", + "integrity": "sha512-BMhLD/Sw+GbJC21C/UgyaZX41nPt8bUTg+jWyDeg7e7YN4xOM05YPSIXceACnXVtqyEw/LMClUQMtMZ+PGGpqQ==", + "dev": true, + "license": "BSD-2-Clause", + "engines": { + "node": ">=20" + } + }, + "node_modules/jsdom/node_modules/whatwg-url": { + "version": "16.0.1", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-16.0.1.tgz", + "integrity": "sha512-1to4zXBxmXHV3IiSSEInrreIlu02vUOvrhxJJH5vcxYTBDAx51cqZiKdyTxlecdKNSjj8EcxGBxNf6Vg+945gw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@exodus/bytes": "^1.11.0", + "tr46": "^6.0.0", + "webidl-conversions": "^8.0.1" + }, + "engines": { + "node": "^20.19.0 || ^22.12.0 || >=24.0.0" + } + }, "node_modules/jszip": { "version": "3.10.1", "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", @@ -959,6 +1425,16 @@ "url": "https://opencollective.com/parcel" } }, + "node_modules/lru-cache": { + "version": "11.5.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.5.0.tgz", + "integrity": "sha512-5YgH9UJd7wVb9hIouI2adWpgqrrICkt070Dnj8EUY1+B4B2P9eRLPAkAAo6NICA7CEhOIeBHl46u9zSNpNu7zA==", + "dev": true, + "license": "BlueOak-1.0.0", + "engines": { + "node": 
"20 || >=22" + } + }, "node_modules/magic-string": { "version": "0.30.21", "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.21.tgz", @@ -969,6 +1445,13 @@ "@jridgewell/sourcemap-codec": "^1.5.5" } }, + "node_modules/mdn-data": { + "version": "2.27.1", + "resolved": "https://registry.npmjs.org/mdn-data/-/mdn-data-2.27.1.tgz", + "integrity": "sha512-9Yubnt3e8A0OKwxYSXyhLymGW4sCufcLG6VdiDdUGVkPhpqLxlvP5vl1983gQjJl3tqbrM731mjaZaP68AgosQ==", + "dev": true, + "license": "CC0-1.0" + }, "node_modules/nanoid": { "version": "3.3.12", "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.12.tgz", @@ -988,6 +1471,26 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, "node_modules/obug": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/obug/-/obug-2.1.1.tgz", @@ -999,12 +1502,41 @@ ], "license": "MIT" }, + "node_modules/opencollective-postinstall": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.3.tgz", + "integrity": "sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==", + "license": "MIT", + "bin": { + "opencollective-postinstall": "index.js" + } + }, "node_modules/pako": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", "license": "(MIT AND Zlib)" }, + "node_modules/papaparse": { 
+ "version": "5.5.3", + "resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.5.3.tgz", + "integrity": "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A==", + "dev": true, + "license": "MIT" + }, + "node_modules/parse5": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.1.tgz", + "integrity": "sha512-z1e/HMG90obSGeidlli3hj7cbocou0/wa5HacvI3ASx34PecNjNQeaHNo5WIZpWofN9kgkqV1q5YvXe3F0FoPw==", + "dev": true, + "license": "MIT", + "dependencies": { + "entities": "^8.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/pathe": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/pathe/-/pathe-2.0.3.tgz", @@ -1114,6 +1646,16 @@ "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", "license": "MIT" }, + "node_modules/punycode": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", + "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/readable-stream": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", @@ -1129,6 +1671,22 @@ "util-deprecate": "~1.0.1" } }, + "node_modules/regenerator-runtime": { + "version": "0.13.11", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.11.tgz", + "integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==", + "license": "MIT" + }, + "node_modules/require-from-string": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz", + "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==", + 
"dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/rolldown": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/rolldown/-/rolldown-1.0.1.tgz", @@ -1169,6 +1727,19 @@ "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", "license": "MIT" }, + "node_modules/saxes": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", + "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", + "dev": true, + "license": "ISC", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=v12.22.7" + } + }, "node_modules/setimmediate": { "version": "1.0.5", "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", @@ -1215,6 +1786,38 @@ "safe-buffer": "~5.1.0" } }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==", + "dev": true, + "license": "MIT" + }, + "node_modules/tesseract.js": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tesseract.js/-/tesseract.js-5.1.1.tgz", + "integrity": "sha512-lzVl/Ar3P3zhpUT31NjqeCo1f+D5+YfpZ5J62eo2S14QNVOmHBTtbchHm/YAbOOOzCegFnKf4B3Qih9LuldcYQ==", + "hasInstallScript": true, + "license": "Apache-2.0", + "dependencies": { + "bmp-js": "^0.1.0", + "idb-keyval": "^6.2.0", + "is-electron": "^2.2.2", + "is-url": "^1.2.4", + "node-fetch": "^2.6.9", + "opencollective-postinstall": "^2.0.3", + "regenerator-runtime": "^0.13.3", + "tesseract.js-core": "^5.1.1", + "wasm-feature-detect": "^1.2.11", + "zlibjs": "^0.3.1" + } + }, + "node_modules/tesseract.js-core": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-5.1.1.tgz", + "integrity": 
"sha512-KX3bYSU5iGcO1XJa+QGPbi+Zjo2qq6eBhNjSGR5E5q0JtzkoipJKOUQD7ph8kFyteCEfEQ0maWLu8MCXtvX5uQ==", + "license": "Apache-2.0" + }, "node_modules/tinybench": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", @@ -1259,6 +1862,45 @@ "node": ">=14.0.0" } }, + "node_modules/tldts": { + "version": "7.4.0", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-7.4.0.tgz", + "integrity": "sha512-yHBe+zVfzNZ3QfTPW/Z6KK1G2t340gFjMHqI/4KKSt/abzYydzuCnpqdaF5gCCABby+9Yfbj59oR5F2Fd5CBzg==", + "dev": true, + "license": "MIT", + "dependencies": { + "tldts-core": "^7.4.0" + }, + "bin": { + "tldts": "bin/cli.js" + } + }, + "node_modules/tldts-core": { + "version": "7.4.0", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.4.0.tgz", + "integrity": "sha512-/mb9kRld+x1sIMXxWNOAp5m6C+D4GrAORWlJkOJ5dElvxdN1eutz/o7qHLp9gFvDF4Y3/L2xeScoxz6AbEo8rQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/tough-cookie": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-6.0.1.tgz", + "integrity": "sha512-LktZQb3IeoUWB9lqR5EWTHgW/VTITCXg4D21M+lvybRVdylLrRMnqaIONLVb5mav8vM19m44HIcGq4qASeu2Qw==", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "tldts": "^7.0.5" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, "node_modules/tslib": { "version": "2.8.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", @@ -1281,6 +1923,23 @@ "node": ">=14.17" } }, + "node_modules/undici": { + "version": "7.26.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.26.0.tgz", + "integrity": "sha512-3O9Tf67pGhgOv9jM35AbhkXAKi13f3oy3aE4CSgr+TckGeY+/iu97ZXN+J7DpHPzLbVApFd1IFhcnBjREYXYcg==", + "dev": true, + "license": "MIT", 
+ "engines": { + "node": ">=20.18.1" + } + }, + "node_modules/undici-types": { + "version": "7.24.6", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.24.6.tgz", + "integrity": "sha512-WRNW+sJgj5OBN4/0JpHFqtqzhpbnV0GuB+OozA9gCL7a993SmU+1JBZCzLNxYsbMfIeDL+lTsphD5jN5N+n0zg==", + "dev": true, + "license": "MIT" + }, "node_modules/util-deprecate": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", @@ -1455,6 +2114,51 @@ } } }, + "node_modules/w3c-xmlserializer": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", + "integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==", + "dev": true, + "license": "MIT", + "dependencies": { + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/wasm-feature-detect": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/wasm-feature-detect/-/wasm-feature-detect-1.8.0.tgz", + "integrity": "sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==", + "license": "Apache-2.0" + }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-mimetype": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-5.0.0.tgz", + "integrity": "sha512-sXcNcHOC51uPGF0P/D4NVtrkjSU2fNsm9iog4ZvZJsL3rjoDAzXZhkm2MWt1y+PUdggKAYVoMAIYcs78wJ51Cw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=20" + } + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": 
"sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/why-is-node-running": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz", @@ -1471,6 +2175,44 @@ "engines": { "node": ">=8" } + }, + "node_modules/xlsx": { + "version": "0.20.3", + "resolved": "https://cdn.sheetjs.com/xlsx-latest/xlsx-latest.tgz", + "integrity": "sha512-oLDq3jw7AcLqKWH2AhCpVTZl8mf6X2YReP+Neh0SJUzV/BdZYjth94tG5toiMB1PPrYtxOCfaoUCkvtuH+3AJA==", + "license": "Apache-2.0", + "bin": { + "xlsx": "bin/xlsx.njs" + }, + "engines": { + "node": ">=0.8" + } + }, + "node_modules/xml-name-validator": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", + "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=18" + } + }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==", + "dev": true, + "license": "MIT" + }, + "node_modules/zlibjs": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz", + "integrity": "sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==", + "license": "MIT", + "engines": { + "node": "*" + } } } } diff --git a/package.json b/package.json index 6508ba1..f3ab39c 100644 --- a/package.json +++ b/package.json @@ -9,12 +9,16 @@ "preview": "vite preview", "test": "vitest run", "test:watch": "vitest", - "test:e2e": "playwright test" + "test:e2e": "playwright test", + "typecheck": "tsc --noEmit" }, "devDependencies": { 
"@axe-core/playwright": "^4.11.3", "@playwright/test": "^1.60.0", + "@types/papaparse": "^5.5.2", "axe-core": "^4.11.4", + "jsdom": "^29.1.1", + "papaparse": "^5.5.3", "playwright": "^1.60.0", "typescript": "~6.0.2", "vite": "^8.0.12", @@ -22,6 +26,8 @@ }, "dependencies": { "heic2any": "^0.0.4", - "jszip": "^3.10.1" + "jszip": "^3.10.1", + "tesseract.js": "^5.1.1", + "xlsx": "https://cdn.sheetjs.com/xlsx-latest/xlsx-latest.tgz" } } diff --git a/playwright.config.ts b/playwright.config.ts index b8cdedb..ab30f79 100644 --- a/playwright.config.ts +++ b/playwright.config.ts @@ -8,12 +8,11 @@ export default defineConfig({ baseURL: 'http://127.0.0.1:4173', trace: 'on-first-retry', }, + // NOTE: CI must run `npm run build` before `npm run test:e2e` so dist/ exists for preview. webServer: { - command: 'npm run dev -- --host 127.0.0.1 --port 4173', + command: 'npm run preview -- --port 4173 --strictPort --host 127.0.0.1', url: 'http://127.0.0.1:4173', - reuseExistingServer: true, - stdout: 'ignore', - stderr: 'pipe', + reuseExistingServer: !process.env.CI, timeout: 120_000, }, projects: [ diff --git a/src/lib/export.test.ts b/src/lib/export.test.ts new file mode 100644 index 0000000..0b6037a --- /dev/null +++ b/src/lib/export.test.ts @@ -0,0 +1,358 @@ +import { describe, it, expect } from 'vitest'; +import Papa from 'papaparse'; +import { buildExport } from './export'; +import type { OcrResult } from './ocr-types'; + +// ── Helpers ────────────────────────────────────────────────────────────────── + +async function blobText(blob: Blob): Promise { + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = () => resolve(reader.result as string); + reader.onerror = reject; + reader.readAsText(blob); + }); +} + +async function blobBytes(blob: Blob): Promise { + return new Promise((resolve, reject) => { + const reader = new FileReader(); + reader.onload = () => resolve(reader.result as ArrayBuffer); + reader.onerror = reject; + 
reader.readAsArrayBuffer(blob); + }); +} + +// ── Fixtures ────────────────────────────────────────────────────────────────── + +const r1: OcrResult = { + id: '1', + fileName: 'photo one.jpg', + text: 'Hello World', + confidence: 0.95, +}; + +const r2: OcrResult = { + id: '2', + fileName: 'photo_two.png', + text: 'Line1\nLine2', + confidence: 0.88, +}; + +const r3: OcrResult = { + id: '3', + fileName: 'scan, third.jpg', + text: 'Text with "quotes" and, commas', + confidence: 0.72, +}; + +const rReceipt1: OcrResult = { + id: 'r1', + fileName: 'receipt1.jpg', + text: 'ACME Store\n2025-01-15\nApples 2 x 1.50 3.00\nBread 2.50\nSubtotal 5.50\nTax 0.45\nTotal 5.95', + confidence: 0.9, + parsed: { + merchant: 'ACME Store', + date: '2025-01-15', + currency: 'USD', + items: [ + { description: 'Apples', quantity: 2, unitPrice: 1.5, total: 3.0 }, + { description: 'Bread', total: 2.5 }, + ], + subtotal: 5.5, + tax: 0.45, + total: 5.95, + }, +}; + +const rReceipt2: OcrResult = { + id: 'r2', + fileName: 'receipt2.jpg', + text: 'Corner Deli\n2025-03-10\nCoffee 1.80\nTotal 1.80', + confidence: 0.85, + parsed: { + merchant: 'Corner Deli', + date: '2025-03-10', + currency: 'USD', + items: [ + { description: 'Coffee', total: 1.8 }, + ], + total: 1.8, + }, +}; + +const rNoParsed: OcrResult = { + id: 'np', + fileName: 'no-parse.jpg', + text: 'Some unrecognized text', + confidence: 0.5, + // no parsed field +}; + +// ── TXT tests ───────────────────────────────────────────────────────────────── + +describe('buildExport — TXT', () => { + it('case 1: single result combined=false → 1 artifact with base filename', async () => { + const artifacts = await buildExport([r1], 'txt', { combined: false }); + expect(artifacts).toHaveLength(1); + expect(artifacts[0].fileName).toBe('photo one.txt'); + const text = await blobText(artifacts[0].blob); + expect(text).toBe(r1.text); + }); + + it('case 2: three results combined=true → 1 artifact with === headers', async () => { + const artifacts = await 
buildExport([r1, r2, r3], 'txt', { combined: true }); + expect(artifacts).toHaveLength(1); + expect(artifacts[0].fileName).toBe('ocr-results.txt'); + const text = await blobText(artifacts[0].blob); + expect(text).toContain('=== photo one.jpg ===\n'); + expect(text).toContain('=== photo_two.png ===\n'); + expect(text).toContain('=== scan, third.jpg ===\n'); + expect(text).toContain(r1.text); + expect(text).toContain(r2.text); + expect(text).toContain(r3.text); + }); + + it('case 3: three results combined=false → 3 artifacts', async () => { + const artifacts = await buildExport([r1, r2, r3], 'txt', { combined: false }); + expect(artifacts).toHaveLength(3); + }); + + it('case 4: sanitizes weird filenames (spaces, colons)', async () => { + const weirdResult: OcrResult = { + id: 'w', + fileName: 'C:\\My File: Test?.jpg', + text: 'abc', + confidence: 0.9, + }; + const artifacts = await buildExport([weirdResult], 'txt', { combined: false }); + expect(artifacts[0].fileName).not.toContain(':'); + expect(artifacts[0].fileName).not.toContain('\\'); + expect(artifacts[0].fileName).not.toContain('?'); + expect(artifacts[0].fileName).toMatch(/\.txt$/); + }); +}); + +// ── CSV free-form tests ─────────────────────────────────────────────────────── + +describe('buildExport — CSV (free-form)', () => { + it('case 5: header row equals fileName,confidence,text', async () => { + const artifacts = await buildExport([r1], 'csv', { combined: true }); + const text = await blobText(artifacts[0].blob); + const parsed = Papa.parse(text, { header: false }); + expect(parsed.data[0]).toEqual(['fileName', 'confidence', 'text']); + }); + + it('case 6: multi-line text is quoted and preserves newlines', async () => { + const artifacts = await buildExport([r2], 'csv', { combined: false }); + const text = await blobText(artifacts[0].blob); + const parsed = Papa.parse>(text, { header: true }); + expect(parsed.data[0].text).toBe('Line1\nLine2'); + }); + + it('case 7: cells with commas are quoted', async 
() => { + const commaResult: OcrResult = { + id: 'c', + fileName: 'file.jpg', + text: 'hello, world', + confidence: 0.8, + }; + const artifacts = await buildExport([commaResult], 'csv', { combined: false }); + const text = await blobText(artifacts[0].blob); + const parsed = Papa.parse>(text, { header: true }); + expect(parsed.data[0].text).toBe('hello, world'); + }); + + it('case 8: embedded double-quotes get doubled (RFC 4180)', async () => { + const quotedResult: OcrResult = { + id: 'q', + fileName: 'file.jpg', + text: 'say "hello"', + confidence: 0.8, + }; + const artifacts = await buildExport([quotedResult], 'csv', { combined: false }); + const raw = await blobText(artifacts[0].blob); + // The raw CSV must contain doubled quotes + expect(raw).toContain('""'); + // Round-trip must recover the original + const parsed = Papa.parse>(raw, { header: true }); + expect(parsed.data[0].text).toBe('say "hello"'); + }); + + it('case 9a: combined=true → 1 artifact ocr-results.csv', async () => { + const artifacts = await buildExport([r1, r2], 'csv', { combined: true }); + expect(artifacts).toHaveLength(1); + expect(artifacts[0].fileName).toBe('ocr-results.csv'); + const text = await blobText(artifacts[0].blob); + const parsed = Papa.parse>(text, { header: true }); + expect(parsed.data).toHaveLength(2); + }); + + it('case 9b: combined=false → 1 artifact per file, single-row CSV', async () => { + const artifacts = await buildExport([r1, r2, r3], 'csv', { combined: false }); + expect(artifacts).toHaveLength(3); + for (const artifact of artifacts) { + expect(artifact.fileName).toMatch(/\.csv$/); + const text = await blobText(artifact.blob); + const parsed = Papa.parse>(text, { header: true }); + expect(parsed.data).toHaveLength(1); + expect(parsed.meta.fields).toEqual(['fileName', 'confidence', 'text']); + } + }); +}); + +// ── CSV receipt mode tests ──────────────────────────────────────────────────── + +describe('buildExport — CSV (receipt mode)', () => { + const 
RECEIPT_COLUMNS = [ + 'fileName', 'merchant', 'date', 'currency', + 'description', 'quantity', 'unitPrice', 'lineTotal', + 'subtotal', 'tax', 'total', + ]; + + it('case 10: columns match receipt schema, one row per line item', async () => { + const artifacts = await buildExport([rReceipt1], 'csv', { combined: true, receiptMode: true }); + const text = await blobText(artifacts[0].blob); + const parsed = Papa.parse>(text, { header: true }); + expect(parsed.meta.fields).toEqual(RECEIPT_COLUMNS); + // rReceipt1 has 2 items → 2 rows + expect(parsed.data).toHaveLength(2); + // First row carries subtotal/tax/total + expect(parsed.data[0].subtotal).toBe('5.5'); + expect(parsed.data[0].tax).toBe('0.45'); + expect(parsed.data[0].total).toBe('5.95'); + // Second row leaves those empty + expect(parsed.data[1].subtotal).toBe(''); + expect(parsed.data[1].tax).toBe(''); + expect(parsed.data[1].total).toBe(''); + }); + + it('case 11: result with no parsed → emit single row with empty item fields', async () => { + const artifacts = await buildExport([rNoParsed], 'csv', { combined: true, receiptMode: true }); + const text = await blobText(artifacts[0].blob); + const parsed = Papa.parse>(text, { header: true }); + expect(parsed.data).toHaveLength(1); + expect(parsed.data[0].fileName).toBe('no-parse.jpg'); + expect(parsed.data[0].description).toBe(''); + expect(parsed.data[0].total).toBe(''); + }); + + it('case 12: round-trip parse → one row per item across all results', async () => { + const artifacts = await buildExport([rReceipt1, rReceipt2], 'csv', { + combined: true, + receiptMode: true, + }); + const text = await blobText(artifacts[0].blob); + const parsed = Papa.parse>(text, { header: true }); + // rReceipt1 has 2 items, rReceipt2 has 1 item → 3 rows total + expect(parsed.data).toHaveLength(3); + const descriptions = parsed.data.map(r => r.description); + expect(descriptions).toContain('Apples'); + expect(descriptions).toContain('Bread'); + 
expect(descriptions).toContain('Coffee'); + }); +}); + +// ── XLSX tests ──────────────────────────────────────────────────────────────── + +describe('buildExport — XLSX', () => { + async function readWorkbook(blob: Blob) { + const XLSX = await import('xlsx'); + const buf = await blobBytes(blob); + return XLSX.read(buf, { type: 'array' }); + } + + it('case 13: free-form single result → 1 sheet, cells match', async () => { + const artifacts = await buildExport([r1], 'xlsx', { combined: false }); + expect(artifacts).toHaveLength(1); + const wb = await readWorkbook(artifacts[0].blob); + expect(wb.SheetNames).toHaveLength(1); + const ws = wb.Sheets[wb.SheetNames[0]]; + const rows = (await import('xlsx')).utils.sheet_to_json(ws, { header: 1 }); + // Header row + expect(rows[0]).toEqual(['fileName', 'confidence', 'text']); + // Data row + expect(rows[1][0]).toBe(r1.fileName); + expect(rows[1][2]).toBe(r1.text); + }); + + it('case 14: three results → 3 sheets, name collisions get numeric suffix', async () => { + // Two results with same base name after sanitization + const dup1: OcrResult = { id: 'd1', fileName: 'receipt.jpg', text: 'a', confidence: 0.9 }; + const dup2: OcrResult = { id: 'd2', fileName: 'receipt.png', text: 'b', confidence: 0.8 }; + const dup3: OcrResult = { id: 'd3', fileName: 'other.jpg', text: 'c', confidence: 0.7 }; + const artifacts = await buildExport([dup1, dup2, dup3], 'xlsx', { combined: true }); + expect(artifacts).toHaveLength(1); + const wb = await readWorkbook(artifacts[0].blob); + expect(wb.SheetNames).toHaveLength(3); + expect(wb.SheetNames[0]).toBe('receipt'); + expect(wb.SheetNames[1]).toBe('receipt(1)'); + expect(wb.SheetNames[2]).toBe('other'); + }); + + it('case 14b: sheet names are ≤31 chars', async () => { + const longName: OcrResult = { + id: 'l', + fileName: 'this_is_a_very_long_filename_that_exceeds_31_characters.jpg', + text: 'x', + confidence: 0.9, + }; + const artifacts = await buildExport([longName], 'xlsx', { combined: false 
}); + const wb = await readWorkbook(artifacts[0].blob); + for (const name of wb.SheetNames) { + expect(name.length).toBeLessThanOrEqual(31); + } + }); + + it('case 15: receipt mode → header block then item table', async () => { + const artifacts = await buildExport([rReceipt1], 'xlsx', { + combined: false, + receiptMode: true, + }); + expect(artifacts).toHaveLength(1); + const wb = await readWorkbook(artifacts[0].blob); + const XLSX = await import('xlsx'); + const ws = wb.Sheets[wb.SheetNames[0]]; + const rows = XLSX.utils.sheet_to_json<(string | number | undefined)[]>(ws, { header: 1 }); + + // Header block: A col labels, B col values + const headerLabels = rows.slice(0, 6).map(r => r[0]); + expect(headerLabels).toContain('Merchant'); + expect(headerLabels).toContain('Date'); + expect(headerLabels).toContain('Currency'); + expect(headerLabels).toContain('Subtotal'); + expect(headerLabels).toContain('Tax'); + expect(headerLabels).toContain('Total'); + + // Find item table header + const tableHeaderIdx = rows.findIndex(r => + r[0] === 'Description' && + r[1] === 'Quantity' && + r[2] === 'Unit Price' && + r[3] === 'Line Total' + ); + expect(tableHeaderIdx).toBeGreaterThan(6); + + // Items follow after the header + const itemRows = rows.slice(tableHeaderIdx + 1).filter(r => r[0]); + expect(itemRows).toHaveLength(2); + expect(itemRows[0][0]).toBe('Apples'); + expect(itemRows[1][0]).toBe('Bread'); + }); + + it('case 16: combined=true → 1 xlsx artifact; combined=false → 1 per result', async () => { + const combined = await buildExport([r1, r2, r3], 'xlsx', { combined: true }); + expect(combined).toHaveLength(1); + expect(combined[0].fileName).toBe('ocr-results.xlsx'); + const wb = await readWorkbook(combined[0].blob); + expect(wb.SheetNames).toHaveLength(3); + + const separate = await buildExport([r1, r2, r3], 'xlsx', { combined: false }); + expect(separate).toHaveLength(3); + for (const art of separate) { + expect(art.fileName).toMatch(/\.xlsx$/); + const wbSingle = 
await readWorkbook(art.blob); + expect(wbSingle.SheetNames).toHaveLength(1); + } + }); +}); diff --git a/src/lib/export.ts b/src/lib/export.ts new file mode 100644 index 0000000..14ee349 --- /dev/null +++ b/src/lib/export.ts @@ -0,0 +1,365 @@ +import type { OcrResult, ExportFormat, ReceiptData } from './ocr-types'; + +export interface ExportOptions { + combined?: boolean; + receiptMode?: boolean; +} + +export interface ExportArtifact { + blob: Blob; + fileName: string; +} + +// ── Private helpers ─────────────────────────────────────────────────────────── + +/** + * Strip extension, trim whitespace, replace characters that are illegal in + * common filesystems (Windows + POSIX) with underscores. + * Illegal: \ / : * ? " < > | and control characters. + */ +function sanitizeBase(name: string): string { + // Strip extension + const lastDot = name.lastIndexOf('.'); + const base = lastDot > 0 ? name.slice(0, lastDot) : name; + return base.trim().replace(/[\\/:*?"<>|\x00-\x1f]/g, '_').trim() || 'file'; +} + +/** + * Produce a valid XLSX sheet name: + * - Replace characters illegal in sheet names: [ ] : * ? / \ and quotes + * - Trim to 31 characters (XLSX hard limit) + * - Deduplicate: if the name already exists in `used`, append (1), (2), … + */ +function sanitizeSheetName(name: string, used: Set): string { + // Replace illegal sheet-name chars + let safe = name.replace(/[\[\]:*?/\\'"]/g, '_').slice(0, 31); + if (!safe) safe = 'Sheet'; + + if (!used.has(safe)) { + used.add(safe); + return safe; + } + + let n = 1; + while (true) { + const suffix = `(${n})`; + // Trim base so the whole thing stays ≤ 31 chars + const candidate = safe.slice(0, 31 - suffix.length) + suffix; + if (!used.has(candidate)) { + used.add(candidate); + return candidate; + } + n++; + } +} + +/** + * RFC 4180 CSV cell escaping. + * Wrap in double-quotes if the value contains comma, newline, or double-quote. + * Double up any embedded double-quotes. 
+ */ +function csvEscape(cell: string | number | undefined | null): string { + const s = cell == null ? '' : String(cell); + if (s.includes('"') || s.includes(',') || s.includes('\n') || s.includes('\r')) { + return '"' + s.replace(/"/g, '""') + '"'; + } + return s; +} + +function buildCsvRow(cells: (string | number | undefined | null)[]): string { + return cells.map(csvEscape).join(','); +} + +// ── TXT export ──────────────────────────────────────────────────────────────── + +function buildTxt(results: OcrResult[], combined: boolean): ExportArtifact[] { + if (combined) { + const parts = results.map(r => `=== ${r.fileName} ===\n${r.text}`); + const content = parts.join('\n\n'); + return [{ + blob: new Blob([content], { type: 'text/plain' }), + fileName: 'ocr-results.txt', + }]; + } + + return results.map(r => ({ + blob: new Blob([r.text], { type: 'text/plain' }), + fileName: sanitizeBase(r.fileName) + '.txt', + })); +} + +// ── CSV free-form export ────────────────────────────────────────────────────── + +const CSV_FREEFORM_HEADER = ['fileName', 'confidence', 'text']; + +function resultToCsvRow(r: OcrResult): string { + return buildCsvRow([r.fileName, r.confidence, r.text]); +} + +function buildCsvFreeform(results: OcrResult[], combined: boolean): ExportArtifact[] { + const headerLine = buildCsvRow(CSV_FREEFORM_HEADER); + + if (combined) { + const lines = [headerLine, ...results.map(resultToCsvRow)]; + return [{ + blob: new Blob([lines.join('\n')], { type: 'text/csv' }), + fileName: 'ocr-results.csv', + }]; + } + + return results.map(r => { + const lines = [headerLine, resultToCsvRow(r)]; + return { + blob: new Blob([lines.join('\n')], { type: 'text/csv' }), + fileName: sanitizeBase(r.fileName) + '.csv', + }; + }); +} + +// ── CSV receipt export ──────────────────────────────────────────────────────── + +const CSV_RECEIPT_HEADER = [ + 'fileName', 'merchant', 'date', 'currency', + 'description', 'quantity', 'unitPrice', 'lineTotal', + 'subtotal', 'tax', 'total', 
+]; + +/** + * Convert one OcrResult into receipt CSV rows. + * + * Convention (case 11): If a result has no `parsed`, emit a single row + * with fileName populated and all other fields empty. This is more useful + * than silently skipping — it preserves the file in the output so the user + * can see which files failed to parse. + * + * For results with items: the first row per result carries subtotal/tax/total; + * subsequent item rows leave those columns empty. + */ +function resultToReceiptRows(r: OcrResult): (string | number | undefined | null)[][] { + if (!r.parsed) { + // Emit one empty row so the file is represented + return [[r.fileName, '', '', '', '', '', '', '', '', '', '']]; + } + + const { parsed } = r; + const { merchant, date, currency, items, subtotal, tax, total } = parsed; + + if (items.length === 0) { + // No items but has parsed metadata + return [[r.fileName, merchant ?? '', date ?? '', currency ?? '', '', '', '', '', subtotal ?? '', tax ?? '', total ?? '']]; + } + + return items.map((item, idx) => { + const isFirst = idx === 0; + return [ + r.fileName, + isFirst ? (merchant ?? '') : '', + isFirst ? (date ?? '') : '', + isFirst ? (currency ?? '') : '', + item.description, + item.quantity ?? '', + item.unitPrice ?? '', + item.total, + isFirst ? (subtotal ?? '') : '', + isFirst ? (tax ?? '') : '', + isFirst ? (total ?? 
'') : '', + ]; + }); +} + +function buildCsvReceipt(results: OcrResult[], combined: boolean): ExportArtifact[] { + const headerLine = buildCsvRow(CSV_RECEIPT_HEADER); + + if (combined) { + const dataLines = results.flatMap(r => resultToReceiptRows(r).map(row => buildCsvRow(row))); + const lines = [headerLine, ...dataLines]; + return [{ + blob: new Blob([lines.join('\n')], { type: 'text/csv' }), + fileName: 'ocr-results.csv', + }]; + } + + return results.map(r => { + const dataLines = resultToReceiptRows(r).map(row => buildCsvRow(row)); + const lines = [headerLine, ...dataLines]; + return { + blob: new Blob([lines.join('\n')], { type: 'text/csv' }), + fileName: sanitizeBase(r.fileName) + '.csv', + }; + }); +} + +// ── XLSX free-form ──────────────────────────────────────────────────────────── + +async function buildXlsxFreeform(results: OcrResult[], combined: boolean): Promise { + // Lazy import to keep the main bundle slim + const XLSX = await import('xlsx'); + + function makeSheet(result: OcrResult) { + const rows: (string | number)[][] = [ + ['fileName', 'confidence', 'text'], + [result.fileName, result.confidence, result.text], + ]; + return XLSX.utils.aoa_to_sheet(rows); + } + + function wbToBlob(wb: ReturnType): Blob { + const buf = XLSX.write(wb, { type: 'array', bookType: 'xlsx' }) as ArrayBuffer; + return new Blob([buf], { + type: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + }); + } + + if (combined) { + const wb = XLSX.utils.book_new(); + const usedNames = new Set(); + for (const r of results) { + const sheetName = sanitizeSheetName(sanitizeBase(r.fileName), usedNames); + XLSX.utils.book_append_sheet(wb, makeSheet(r), sheetName); + } + return [{ + blob: wbToBlob(wb), + fileName: 'ocr-results.xlsx', + }]; + } + + return results.map(r => { + const wb = XLSX.utils.book_new(); + const usedNames = new Set(); + const sheetName = sanitizeSheetName(sanitizeBase(r.fileName), usedNames); + XLSX.utils.book_append_sheet(wb, makeSheet(r), 
sheetName); + const buf = XLSX.write(wb, { type: 'array', bookType: 'xlsx' }) as ArrayBuffer; + return { + blob: new Blob([buf], { + type: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + }), + fileName: sanitizeBase(r.fileName) + '.xlsx', + }; + }); +} + +// ── XLSX receipt helpers ────────────────────────────────────────────────────── + +/** + * Builds the top header block for receipt XLSX sheets. + * Returns array-of-arrays: [label, value] pairs for merchant/date/currency/subtotal/tax/total. + */ +function buildReceiptHeaderRows(parsed: ReceiptData): (string | number | undefined)[][] { + return [ + ['Merchant', parsed.merchant ?? ''], + ['Date', parsed.date ?? ''], + ['Currency', parsed.currency ?? ''], + ['Subtotal', parsed.subtotal ?? ''], + ['Tax', parsed.tax ?? ''], + ['Total', parsed.total ?? ''], + ]; +} + +/** + * Builds the item rows for receipt XLSX sheets. + * Returns: [Description, Quantity, Unit Price, Line Total] + */ +function buildItemRows(parsed: ReceiptData): (string | number | undefined)[][] { + return parsed.items.map(item => [ + item.description, + item.quantity ?? '', + item.unitPrice ?? 
'', + item.total, + ]); +} + +// ── XLSX receipt export ─────────────────────────────────────────────────────── + +async function buildXlsxReceipt(results: OcrResult[], combined: boolean): Promise { + const XLSX = await import('xlsx'); + + function makeReceiptSheet(result: OcrResult) { + const rows: (string | number | undefined)[][] = []; + + if (!result.parsed) { + // No parsed data — emit a minimal sheet with just the filename + rows.push(['File', result.fileName]); + rows.push(['(no receipt data parsed)']); + return XLSX.utils.aoa_to_sheet(rows); + } + + // Header block: rows 1–6 (0-indexed 0–5) + const headerRows = buildReceiptHeaderRows(result.parsed); + rows.push(...headerRows); + + // Blank row (row 7, 0-indexed 6) + rows.push([]); + + // Item table header (row 8, 0-indexed 7) + rows.push(['Description', 'Quantity', 'Unit Price', 'Line Total']); + + // Item data rows + const itemRows = buildItemRows(result.parsed); + rows.push(...itemRows); + + return XLSX.utils.aoa_to_sheet(rows); + } + + function wbToBlob(wb: ReturnType): Blob { + const buf = XLSX.write(wb, { type: 'array', bookType: 'xlsx' }) as ArrayBuffer; + return new Blob([buf], { + type: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + }); + } + + if (combined) { + const wb = XLSX.utils.book_new(); + const usedNames = new Set(); + for (const r of results) { + const sheetName = sanitizeSheetName(sanitizeBase(r.fileName), usedNames); + XLSX.utils.book_append_sheet(wb, makeReceiptSheet(r), sheetName); + } + return [{ + blob: wbToBlob(wb), + fileName: 'ocr-results.xlsx', + }]; + } + + return results.map(r => { + const wb = XLSX.utils.book_new(); + const usedNames = new Set(); + const sheetName = sanitizeSheetName(sanitizeBase(r.fileName), usedNames); + XLSX.utils.book_append_sheet(wb, makeReceiptSheet(r), sheetName); + const buf = XLSX.write(wb, { type: 'array', bookType: 'xlsx' }) as ArrayBuffer; + return { + blob: new Blob([buf], { + type: 
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + }), + fileName: sanitizeBase(r.fileName) + '.xlsx', + }; + }); +} + +// ── Public API ──────────────────────────────────────────────────────────────── + +export async function buildExport( + results: OcrResult[], + format: ExportFormat, + opts?: ExportOptions +): Promise { + const combined = opts?.combined ?? false; + const receiptMode = opts?.receiptMode ?? false; + + switch (format) { + case 'txt': + return buildTxt(results, combined); + + case 'csv': + return receiptMode + ? buildCsvReceipt(results, combined) + : buildCsvFreeform(results, combined); + + case 'xlsx': + return receiptMode + ? buildXlsxReceipt(results, combined) + : buildXlsxFreeform(results, combined); + + default: + throw new Error(`Unknown export format: ${format as string}`); + } +} diff --git a/src/lib/ocr-types.ts b/src/lib/ocr-types.ts new file mode 100644 index 0000000..5213870 --- /dev/null +++ b/src/lib/ocr-types.ts @@ -0,0 +1,42 @@ +export type ExportFormat = 'txt' | 'csv' | 'xlsx'; + +export interface OcrWord { + text: string; + confidence: number; + bbox?: [number, number, number, number]; +} + +export interface LineItem { + description: string; + quantity?: number; + unitPrice?: number; + total: number; +} + +export interface ReceiptData { + merchant?: string; + date?: string; + items: LineItem[]; + subtotal?: number; + tax?: number; + total?: number; + currency?: string; +} + +export interface OcrResult { + readonly id: string; + readonly fileName: string; + readonly text: string; + readonly confidence: number; + readonly words?: OcrWord[]; + readonly parsed?: ReceiptData; + readonly error?: string; +} + +export interface OcrProgress { + fileId: string; + phase: string; + ratio: number; +} + +export type OcrProgressFn = (p: OcrProgress) => void; diff --git a/src/lib/ocr.test.ts b/src/lib/ocr.test.ts new file mode 100644 index 0000000..4d0d40a --- /dev/null +++ b/src/lib/ocr.test.ts @@ -0,0 +1,102 @@ +import { 
describe, it, expect, beforeEach, vi } from 'vitest'; +import { runOcr, terminateOcr } from './ocr'; + +// Pull the mocked module so we can inspect call counts +const tesseract = await import('tesseract.js'); +const mockedCreateWorker = vi.mocked(tesseract.createWorker); + +function makeFile(name = 'test.png'): File { + return new File(['fake-image-data'], name, { type: 'image/png' }); +} + +describe('runOcr', () => { + beforeEach(async () => { + await terminateOcr(); + vi.clearAllMocks(); + }); + + it('returns OcrResult with text, confidence, words, id, and fileName', async () => { + const file = makeFile('receipt.png'); + const result = await runOcr(file, 'id-1'); + + expect(result.id).toBe('id-1'); + expect(result.fileName).toBe('receipt.png'); + expect(result.text).toBe('MOCKED OCR TEXT'); + expect(result.confidence).toBe(95); + expect(Array.isArray(result.words)).toBe(true); + }); + + it('calls onProgress at least once with a progress object', async () => { + const file = makeFile('img.png'); + const progressCalls: Array<{ fileId: string; phase: string; ratio: number }> = []; + + await runOcr(file, 'id-progress', { + onProgress: (p) => progressCalls.push(p), + }); + + expect(progressCalls.length).toBeGreaterThan(0); + const call = progressCalls[0]; + expect(call.fileId).toBe('id-progress'); + expect(typeof call.phase).toBe('string'); + expect(typeof call.ratio).toBe('number'); + }); + + it('maps words with bbox as [x0,y0,x1,y1] tuple', async () => { + const file = makeFile('img.png'); + const result = await runOcr(file, 'id-words'); + + expect(result.words).toBeDefined(); + expect(result.words!.length).toBe(2); + + const firstWord = result.words![0]; + expect(firstWord.text).toBe('MOCKED'); + expect(firstWord.bbox).toEqual([0, 0, 10, 10]); + + const secondWord = result.words![1]; + expect(secondWord.text).toBe('OCR'); + expect(secondWord.bbox).toEqual([12, 0, 20, 10]); + }); + + it('reuses the worker across sequential runOcr calls (createWorker called once)', 
import type { OcrResult, OcrProgressFn } from './ocr-types';

/**
 * Structural subset of the tesseract.js worker that this module calls.
 * The library itself is only loaded through the dynamic import in getWorker().
 */
type TesseractWorker = {
  recognize: (img: File | Blob) => Promise<{
    data: {
      text: string;
      confidence: number;
      words?: Array<{
        text: string;
        confidence: number;
        bbox: { x0: number; y0: number; x1: number; y1: number };
      }>;
    };
  }>;
  terminate: () => Promise<unknown>;
};

// Shared worker: created on first use, reused until terminateOcr() or a language change.
let workerPromise: Promise<TesseractWorker> | null = null;
// Language the cached worker was created for.
let workerLang: string | null = null;
// Progress sink and file id of the runOcr() call currently in flight. The worker's
// logger is fixed at creation time, so progress is routed through these module-level
// slots. Overlapping runOcr() calls would therefore share one sink; callers are
// expected to run files one after another.
let activeLogger: OcrProgressFn | null = null;
let activeFileId: string | null = null;

/**
 * Return the shared worker for `lang`, creating it if necessary.
 *
 * - The promise is cached synchronously (before any await), so overlapping first
 *   calls share one worker instead of each creating their own.
 * - A worker created for a different language is terminated and replaced.
 * - A failed creation is not cached, so the next call retries.
 */
function getWorker(lang: string): Promise<TesseractWorker> {
  if (workerPromise && workerLang === lang) return workerPromise;

  const previous = workerPromise;
  const created = (async (): Promise<TesseractWorker> => {
    if (previous) {
      try {
        await (await previous).terminate();
      } catch {
        // best effort: the old worker is being discarded anyway
      }
    }
    const { createWorker } = await import('tesseract.js');
    const worker = await createWorker(lang, 1, {
      logger: (m: { status: string; progress: number }) => {
        if (activeLogger && activeFileId) {
          activeLogger({ fileId: activeFileId, phase: m.status, ratio: m.progress });
        }
      },
    });
    return worker as unknown as TesseractWorker;
  })();

  workerPromise = created;
  workerLang = lang;
  // Drop the cache entry if creation fails; the caller still sees the rejection
  // through the promise returned below.
  created.catch(() => {
    if (workerPromise === created) {
      workerPromise = null;
      workerLang = null;
    }
  });
  return created;
}

/** Throw the module's abort error if `signal` has already fired. */
function throwIfAborted(signal?: AbortSignal): void {
  if (signal?.aborted) throw new Error('OCR aborted');
}

/**
 * Recognise the text in `file` using the shared worker.
 *
 * @param file    Image to recognise.
 * @param fileId  Caller-chosen id; echoed in the result and in progress events.
 * @param opts    `lang` (default 'eng'), `onProgress` callback, and an optional
 *                `signal`. The signal is checked before worker creation, before
 *                recognition and after recognition; a running recognition is not
 *                interrupted.
 * @returns The recognised text, confidence and words, with each word's bbox
 *          flattened to an [x0, y0, x1, y1] tuple.
 * @throws Error('OCR aborted') when the signal has fired at one of the checks;
 *         otherwise whatever worker creation or recognition rejects with.
 */
export async function runOcr(
  file: File,
  fileId: string,
  opts?: { lang?: string; onProgress?: OcrProgressFn; signal?: AbortSignal }
): Promise<OcrResult> {
  const lang = opts?.lang ?? 'eng';
  // Set before getWorker() so events emitted during worker start-up are delivered too.
  activeLogger = opts?.onProgress ?? null;
  activeFileId = fileId;
  try {
    throwIfAborted(opts?.signal);
    const worker = await getWorker(lang);
    throwIfAborted(opts?.signal);
    const { data } = await worker.recognize(file);
    // Honour an abort raised while recognition was running rather than
    // resolving with a result the caller has already given up on.
    throwIfAborted(opts?.signal);
    const words = (data.words ?? []).map(w => ({
      text: w.text,
      confidence: w.confidence,
      bbox: [w.bbox.x0, w.bbox.y0, w.bbox.x1, w.bbox.y1] as [number, number, number, number],
    }));
    return {
      id: fileId,
      fileName: file.name,
      text: data.text,
      confidence: data.confidence,
      words,
    };
  } finally {
    activeLogger = null;
    activeFileId = null;
  }
}

/**
 * Terminate the shared worker, if any. The next runOcr() call creates a fresh one.
 * Termination errors are deliberately ignored (best-effort cleanup).
 */
export async function terminateOcr(): Promise<void> {
  const pending = workerPromise;
  if (!pending) return;
  // Clear the cache before awaiting so a concurrent runOcr() cannot be handed
  // a worker that is in the middle of shutting down.
  workerPromise = null;
  workerLang = null;
  try {
    const worker = await pending;
    await worker.terminate();
  } catch {
    // ignore termination errors
  }
}
expect(r.merchant).toBe('WHOLE FOODS MARKET'); + expect(r.date).toBeTruthy(); + expect(r.items).toHaveLength(3); + expect(r.items[0].description).toContain('Organic Milk'); + expect(r.items[0].total).toBeCloseTo(3.99); + expect(r.items[1].total).toBeCloseTo(5.49); + expect(r.items[2].total).toBeCloseTo(4.25); + expect(r.subtotal).toBeCloseTo(13.73); + expect(r.tax).toBeCloseTo(1.10); + expect(r.total).toBeCloseTo(14.83); + }); + + // Case 2: European receipt with comma decimals + it('parses European receipt with comma decimals', () => { + const text = ` +Bäckerei Schmidt +Datum: 14.03.2025 + +Brot 3,50 +Croissant 2,20 +Kaffee 1,80 + +Gesamt 7,50 +`.trim(); + const r = parseReceipt(text); + expect(r.items.length).toBeGreaterThanOrEqual(3); + const brot = r.items.find(i => i.description.toLowerCase().includes('brot')); + expect(brot).toBeDefined(); + expect(brot!.total).toBeCloseTo(3.5); + const croissant = r.items.find(i => i.description.toLowerCase().includes('croissant')); + expect(croissant!.total).toBeCloseTo(2.2); + }); + + // Case 3: Receipt missing explicit "total" — falls back to sum of items + it('falls back to sum of items when total line is absent', () => { + const text = ` +Corner Deli +Item A 5.00 +Item B 3.00 +Item C 2.00 +`.trim(); + const r = parseReceipt(text); + expect(r.total).toBeCloseTo(10.0); + expect(r.items).toHaveLength(3); + }); + + // Case 4: Quantity prefix lines + it('parses quantity-prefixed lines', () => { + const text = ` +Cafe Nero +2 x Coffee 6.00 +3 x Muffin 7.50 +Total 13.50 +`.trim(); + const r = parseReceipt(text); + const coffee = r.items.find(i => i.description.toLowerCase().includes('coffee')); + expect(coffee).toBeDefined(); + expect(coffee!.quantity).toBe(2); + expect(coffee!.unitPrice).toBeCloseTo(3.0); + expect(coffee!.total).toBeCloseTo(6.0); + const muffin = r.items.find(i => i.description.toLowerCase().includes('muffin')); + expect(muffin!.quantity).toBe(3); + expect(muffin!.unitPrice).toBeCloseTo(2.5); + }); + + // 
Case 5: All-caps merchant + it('detects all-caps merchant name', () => { + const text = ` +WALMART +2025-01-10 +Bananas 0.99 +Total 0.99 +`.trim(); + const r = parseReceipt(text); + expect(r.merchant).toBe('WALMART'); + }); + + // Case 6: Currency symbol detection and normalization + it('detects and normalizes currency symbols', () => { + const usd = parseReceipt(`Store\nItem $5.00\nTotal $5.00`); + expect(usd.currency).toBe('USD'); + + const eur = parseReceipt(`Laden\nArtikel €3,50\nGesamt €3,50`); + expect(eur.currency).toBe('EUR'); + + const gbp = parseReceipt(`Shop\nItem £4.99\nTotal £4.99`); + expect(gbp.currency).toBe('GBP'); + }); + + // Case 7: Date in multiple formats + it('parses dates in multiple formats', () => { + const iso = parseReceipt(`Shop\n2025-03-14\nItem 1.00\nTotal 1.00`); + expect(iso.date).toBe('2025-03-14'); + + const slash = parseReceipt(`Shop\n03/14/2025\nItem 1.00\nTotal 1.00`); + expect(slash.date).toBeTruthy(); + expect(slash.date).toContain('03'); + + const wordy = parseReceipt(`Shop\nMar 14, 2025\nItem 1.00\nTotal 1.00`); + expect(wordy.date).toBeTruthy(); + expect(wordy.date!.toLowerCase()).toContain('mar'); + }); + + // Case 8: Tax + subtotal + total trio — never confuse subtotal with total + it('parses subtotal, tax, and total separately', () => { + const text = ` +Grocery Plus +Apples 2.00 +Oranges 3.00 +Sub-Total 5.00 +Tax 0.40 +Total 5.40 +`.trim(); + const r = parseReceipt(text); + expect(r.subtotal).toBeCloseTo(5.0); + expect(r.tax).toBeCloseTo(0.40); + expect(r.total).toBeCloseTo(5.40); + // Ensure total != subtotal + expect(r.total).not.toEqual(r.subtotal); + }); + + // Case 9: Negative discount line + it('parses negative discount lines', () => { + const text = ` +Supermart +Pasta 2.50 +Discount -2.00 +Total 0.50 +`.trim(); + const r = parseReceipt(text); + const discount = r.items.find(i => i.description.toLowerCase().includes('discount')); + expect(discount).toBeDefined(); + expect(discount!.total).toBeCloseTo(-2.0); + 
import type { LineItem, ReceiptData } from './ocr-types';

// ── Constants ────────────────────────────────────────────────────────────────

const CURRENCY_MAP: Record<string, string> = {
  '$': 'USD',
  '€': 'EUR',
  '£': 'GBP',
};

// A money amount: optional minus, then either thousand-grouped digits
// (1,234 / 1.234) or a plain digit run, then a separator and exactly two decimals.
// The look-arounds stop the pattern from matching *inside* a longer number:
//   - "1234.56" must not be read as "234.56"
//   - "14.03.2025" (a dotted date) must not be read as "14.03"
const AMOUNT_RE = /(?<!\d)(?<!\d[.,])-?(?:\d{1,3}(?:[.,]\d{3})+|\d+)[.,]\d{2}(?![.,]?\d)/g;

const DATE_PATTERNS = [
  /\b(\d{4})-(\d{1,2})-(\d{1,2})\b/,
  /\b(\d{1,2})[/.\-](\d{1,2})[/.\-](\d{2,4})\b/,
  /\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\.?\s+\d{1,2},?\s+\d{2,4}\b/i,
];

const TOTAL_KEYS = /\b(?:grand\s*total|total\s*due|amount\s*due|total|balance)\b/i;
const SUBTOTAL_KEYS = /\b(?:sub[-\s]?total)\b/i;
const TAX_KEYS = /\b(?:tax|vat|gst|hst)\b/i;
const QTY_PREFIX = /^(\d+)\s*[xX@]\s+/;

// ── Helpers ──────────────────────────────────────────────────────────────────

/**
 * Convert a raw amount string ("1,234.56", "1.234,56", "3,50", "-2.00") to a number.
 * The last '.' or ',' is taken as the decimal separator and every earlier
 * separator as a thousands mark, which also covers repeated separators of one
 * kind ("1.234.567,89" or "1,234,567.89").
 */
function normaliseAmount(raw: string): number {
  const decimalAt = Math.max(raw.lastIndexOf(','), raw.lastIndexOf('.'));
  if (decimalAt === -1) return parseFloat(raw);

  const whole = raw.slice(0, decimalAt).replace(/[.,]/g, '');
  const fraction = raw.slice(decimalAt + 1);
  return parseFloat(`${whole}.${fraction}`);
}

/**
 * Locate the right-most amount on a line.
 * Returns its numeric value and the offset where it starts, or null when the
 * line carries no amount.
 */
function findLastAmount(line: string): { value: number; index: number } | null {
  // matchAll works on a copy of the regex, so the shared /g pattern keeps no state.
  const hits = [...line.matchAll(AMOUNT_RE)];
  if (hits.length === 0) return null;

  const last = hits[hits.length - 1];
  const value = normaliseAmount(last[0]);
  return Number.isFinite(value) ? { value, index: last.index ?? 0 } : null;
}

/** Numeric value of the right-most amount on a line, or null if there is none. */
function extractLastAmount(line: string): number | null {
  return findLastAmount(line)?.value ?? null;
}

/** Currency code from the first $, € or £ in the text, else the first USD/EUR/GBP/CAD/AUD code. */
function detectCurrency(text: string): string | undefined {
  const symbolMatch = text.match(/[$€£]/);
  if (symbolMatch) return CURRENCY_MAP[symbolMatch[0]];

  const codeMatch = text.match(/\b(USD|EUR|GBP|CAD|AUD)\b/);
  return codeMatch ? codeMatch[1] : undefined;
}

/**
 * First line that looks like a merchant name: at least 3 characters, contains
 * a letter, does not start with a digit, and is not a date / total / subtotal /
 * tax line. Capped at 60 characters.
 */
function detectMerchant(lines: string[]): string | undefined {
  const candidate = lines.find(
    line =>
      line.length >= 3 &&
      /[a-zA-Z]/.test(line) &&
      !/^\d/.test(line) &&
      !DATE_PATTERNS.some(p => p.test(line)) &&
      !TOTAL_KEYS.test(line) &&
      !SUBTOTAL_KEYS.test(line) &&
      !TAX_KEYS.test(line),
  );
  return candidate?.slice(0, 60);
}

/** First date-like substring, scanning lines top-down and patterns in declaration order. */
function detectDate(lines: string[]): string | undefined {
  for (const line of lines) {
    for (const pattern of DATE_PATTERNS) {
      const m = pattern.exec(line);
      if (m) return m[0];
    }
  }
  return undefined;
}

// ── Main function ─────────────────────────────────────────────────────────────

/**
 * Parse OCR'd receipt text into structured data.
 *
 * Every non-empty line that ends in an amount is classified, in this order, as
 * subtotal, tax, total (the last total line wins) or a line item. Line items may
 * carry a "2 x " / "2 @ " quantity prefix, from which a unit price is derived.
 * When no total line exists but items do, the total falls back to the item sum.
 *
 * @param text Raw receipt text, one receipt line per text line.
 * @returns A ReceiptData whose optional fields are only present when detected;
 *          `{ items: [] }` for empty or whitespace-only input.
 */
export function parseReceipt(text: string): ReceiptData {
  if (!text || !text.trim()) return { items: [] };

  const lines = text.split('\n').map(l => l.trim()).filter(Boolean);

  const currency = detectCurrency(text);
  const merchant = detectMerchant(lines);
  const date = detectDate(lines);

  const items: LineItem[] = [];
  let subtotal: number | undefined;
  let tax: number | undefined;
  let total: number | undefined;

  for (const rawLine of lines) {
    // Currency symbols are removed up front; both the amount and the
    // description are taken from the stripped line.
    const line = rawLine.replace(/[$€£]/g, '').trim();

    const found = findLastAmount(line);
    if (found === null) continue;
    const amount = found.value;

    // Subtotal is tested first because "Sub-Total" also satisfies TOTAL_KEYS.
    if (SUBTOTAL_KEYS.test(line)) {
      subtotal = amount;
      continue;
    }
    if (TAX_KEYS.test(line)) {
      tax = amount;
      continue;
    }
    if (TOTAL_KEYS.test(line)) {
      total = amount; // last wins
      continue;
    }

    // It's a line item: everything left of the amount is the description.
    let description = line.slice(0, found.index).trim();

    if (!description) continue;
    if (TOTAL_KEYS.test(description) || SUBTOTAL_KEYS.test(description) || TAX_KEYS.test(description)) continue;
    if (merchant && description.toLowerCase() === merchant.toLowerCase()) continue;

    // Strip trailing dots/periods from description (OCR noise)
    description = description.replace(/[.\s]+$/, '').trim();
    if (!description) continue;

    let quantity: number | undefined;
    let unitPrice: number | undefined;
    const qtyMatch = QTY_PREFIX.exec(description);
    if (qtyMatch) {
      quantity = parseInt(qtyMatch[1], 10);
      description = description.slice(qtyMatch[0].length).trim();
      // A zero quantity (OCR misread) would give Infinity/NaN; leave the unit price unset.
      if (quantity > 0) {
        unitPrice = Math.round((amount / quantity) * 100) / 100;
      }
    }

    const item: LineItem = { description, total: amount };
    if (quantity !== undefined) item.quantity = quantity;
    if (unitPrice !== undefined) item.unitPrice = unitPrice;

    items.push(item);
  }

  // Fall back: compute total from items if missing and items exist
  if (total === undefined && items.length > 0) {
    const sum = items.reduce((acc, i) => acc + i.total, 0);
    total = Math.round(sum * 100) / 100;
  }

  const result: ReceiptData = { items };
  if (merchant !== undefined) result.merchant = merchant;
  if (date !== undefined) result.date = date;
  if (currency !== undefined) result.currency = currency;
  if (subtotal !== undefined) result.subtotal = subtotal;
  if (tax !== undefined) result.tax = tax;
  if (total !== undefined) result.total = total;

  return result;
}
string { @@ -260,7 +277,272 @@ function renderFileCards(): string { .join(''); } +// ── OCR render helpers ──────────────────────────────────────────────────────── + +function confidenceLevel(confidence: number): 'high' | 'mid' | 'low' { + if (confidence >= 85) return 'high'; + if (confidence >= 60) return 'mid'; + return 'low'; +} + +function renderOcrCards(): string { + if (state.items.length === 0) { + return ` +
+

No images loaded yet

+

Drop images here or use the upload button, then run OCR.

+
+ `; + } + + return state.items + .map((item) => { + const result = state.ocr.results[item.id]; + const progress = state.ocr.progress[item.id] ?? 0; + const isRunning = state.ocr.isRunning; + + const mediaMarkup = item.previewUrl + ? `${escapeHtml(item.file.name)} preview` + : `
${escapeHtml(formatKeyToLabel(item.sourceFormat))}
`; + + let statusBlock = ''; + if (result?.error) { + statusBlock = `

${escapeHtml(result.error)}

`; + } else if (result) { + const level = confidenceLevel(result.confidence); + const confLabel = level === 'high' ? 'High' : level === 'mid' ? 'Mid' : 'Low'; + + let receiptSummary = ''; + if (state.ocr.receiptMode && result.parsed) { + const p = result.parsed; + const itemCount = p.items.length; + receiptSummary = ` +
+ ${p.merchant ? `${escapeHtml(p.merchant)}` : ''} + ${p.date ? `${escapeHtml(p.date)}` : ''} + ${p.total !== undefined ? `Total: ${p.currency ? escapeHtml(p.currency) + ' ' : ''}${p.total.toFixed(2)}` : ''} + ${itemCount} line item${itemCount === 1 ? '' : 's'} +
+ `; + } + + statusBlock = ` +
+ ${confLabel} (${Math.round(result.confidence)}%) +
+ ${receiptSummary} + + `; + } else if (isRunning && progress > 0) { + statusBlock = `

Processing…

`; + } else { + statusBlock = `

Ready to run OCR

`; + } + + const progressPct = Math.round(progress * 100); + const progressBar = (isRunning || progress > 0) + ? `
+
+
` + : ''; + + const downloadBtn = result && !result.error + ? `` + : ''; + + return ` +
+
+ ${mediaMarkup} +
+
+
+ ${escapeHtml(item.file.name)} + ${escapeHtml(formatFileSize(item.file.size))} +
+ ${progressBar} + ${statusBlock} +
+ ${downloadBtn} + +
+
+
+ `; + }) + .join(''); +} + +function renderOcrControlPanel(): string { + const completedResults = Object.values(state.ocr.results).filter(r => !r.error); + const hasResults = completedResults.length > 0; + const runDisabled = state.items.length === 0 || state.ocr.isRunning; + const downloadAllDisabled = !hasResults || state.ocr.isRunning; + + return ` +
+
+ OCR setup +

Extract text from images locally in your browser.

+
+ +
+ + + + + +
+ +
+
+ Loaded + ${state.items.length} +
+
+ Done + ${completedResults.length} +
+
+ Status + ${state.ocr.isRunning ? 'Running' : hasResults ? 'Ready' : 'Idle'} +
+
+ + + + + + +
+ `; +} + +function renderModeToggle(): string { + return ` +
+ + +
+ `; +} + function render(): void { + if (state.mode === 'convert') { + renderConvertMode(); + } else { + renderOcrMode(); + } +} + +function renderOcrMode(): void { + app.innerHTML = ` + +
+ ${renderModeToggle()} + +
+
+ 100% local, in-browser OCR +

Extract text from images without leaving your browser.

+

+ Drop image files below, run OCR, then edit and download the results as plain text, CSV, or Excel. + Optionally enable Receipt mode to parse merchant names, totals and line items. +

+
+ +
+
+ + ${renderOcrControlPanel()} +
+ +
+ +
+ + Step 1 · upload +

Drop images here or choose them from your device.

+

Upload your images, then click Run OCR to extract text.

+ +
+

No ZIP uploads. Unsupported files are rejected.

+

Drag and drop works for single images and batches.

+
+
+
+ +
+ ${renderNotices()} +
+ +
+ ${renderOcrCards()} +
+
+ `; + + // Populate textareas after innerHTML to avoid injection issues + for (const item of state.items) { + const result = state.ocr.results[item.id]; + if (!result || result.error) continue; + const textarea = app.querySelector( + `textarea[data-action="edit-ocr-text"][data-id="${CSS.escape(item.id)}"]`, + ); + if (textarea) { + textarea.value = result.text; + } + } + + bindEvents(); +} + +function renderConvertMode(): void { const convertedCount = state.items.filter((item) => item.converted).length; const destinationLabel = DESTINATION_OPTIONS.find((option) => option.key === state.destination)?.label ?? 'JPG'; @@ -273,6 +555,8 @@ function render(): void { app.innerHTML = `
+ ${renderModeToggle()} +
100% local, in-browser conversion @@ -500,10 +784,14 @@ async function loadFiles(files: File[]): Promise { state.items = preparedItems; state.isPreparing = false; + // Clear OCR state when new files are loaded + state.ocr.results = {}; + state.ocr.progress = {}; + const notices: AppNotice[] = [ { tone: 'success', - text: `${preparedItems.length} file${preparedItems.length === 1 ? '' : 's'} ready for conversion.`, + text: `${preparedItems.length} file${preparedItems.length === 1 ? '' : 's'} ready${state.mode === 'ocr' ? ' for OCR' : ' for conversion'}.`, }, ...errors.map((error) => ({ tone: 'error' as const, text: error })), ]; @@ -594,59 +882,119 @@ async function downloadZip(): Promise { downloadBlob(zipBlob, buildZipFileName(state.destination)); } +// ── OCR logic ───────────────────────────────────────────────────────────────── + +function shouldRenderOcrProgress(itemId: string, ratio: number): boolean { + const last = lastRenderedProgress[itemId] ?? -1; + return ratio - last >= 0.05 || ratio >= 1; +} + +async function runOcrAll(): Promise { + if (state.items.length === 0 || state.ocr.isRunning) return; + + state.ocr.isRunning = true; + render(); + + const runOcrFn: typeof runOcr = (window as any).__ocrTestMode + ? async (file: File, id: string, _opts?: Parameters[2]): Promise => ({ + id, + fileName: file.name, + text: (window as any).__ocrTestMode.text ?? 'TEST RECEIPT\n2025-03-14\nCoffee 4.50\nMuffin 3.25\nTotal 7.75', + confidence: 99, + words: [], + }) + : runOcr; + + for (const item of state.items) { + try { + const result = await runOcrFn(item.file, item.id, { + onProgress: ({ ratio }) => { + state.ocr.progress[item.id] = ratio; + if (shouldRenderOcrProgress(item.id, ratio)) { + lastRenderedProgress[item.id] = ratio; + render(); + } + }, + }); + const parsed = state.ocr.receiptMode ? parseReceipt(result.text) : undefined; + state.ocr.results[item.id] = parsed !== undefined ? 
{ ...result, parsed } : result; + state.ocr.progress[item.id] = 1; + } catch (e) { + state.ocr.results[item.id] = { + id: item.id, + fileName: item.file.name, + text: '', + confidence: 0, + error: e instanceof Error ? e.message : String(e), + }; + setNotices([{ tone: 'error', text: `OCR failed for ${item.file.name}: ${String(e)}` }]); + } + render(); + } + + state.ocr.isRunning = false; + const doneCount = Object.values(state.ocr.results).filter(r => !r.error).length; + setNotices([{ + tone: doneCount > 0 ? 'success' : 'error', + text: `OCR complete. ${doneCount} of ${state.items.length} file${state.items.length === 1 ? '' : 's'} processed successfully.`, + }]); + render(); +} + +async function downloadOcrSingle(itemId: string): Promise { + const result = state.ocr.results[itemId]; + if (!result || result.error) return; + + const artifacts = await buildExport( + [result], + state.ocr.exportFormat, + { combined: false, receiptMode: state.ocr.receiptMode }, + ); + + if (artifacts.length === 1) { + downloadBlob(artifacts[0].blob, artifacts[0].fileName); + } else { + const zip = await createZipBlob(artifacts.map(a => ({ fileName: a.fileName, blob: a.blob }))); + downloadBlob(zip, `ocr-${Date.now()}.zip`); + } +} + +async function downloadOcrAll(): Promise { + const results = Object.values(state.ocr.results).filter(r => !r.error); + if (results.length === 0) return; + + const artifacts = await buildExport( + results, + state.ocr.exportFormat, + { combined: state.ocr.combined, receiptMode: state.ocr.receiptMode }, + ); + + if (artifacts.length === 1) { + downloadBlob(artifacts[0].blob, artifacts[0].fileName); + } else { + const zip = await createZipBlob(artifacts.map(a => ({ fileName: a.fileName, blob: a.blob }))); + downloadBlob(zip, `ocr-results-${Date.now()}.zip`); + } +} + +// ── Event binding ───────────────────────────────────────────────────────────── + function bindEvents(): void { const fileInput = document.querySelector('#file-input'); - const convertButton = 
document.querySelector('#convert-files'); - const downloadZipButton = document.querySelector('#download-zip'); - const clearFilesButton = document.querySelector('#clear-files'); - const sourceSelect = document.querySelector('#source-filter'); - const destinationSelect = document.querySelector('#destination-format'); const dropzone = document.querySelector('#dropzone'); const openFileButtons = document.querySelectorAll('[data-action="open-files"]'); - if ( - !fileInput || - !convertButton || - !downloadZipButton || - !clearFilesButton || - !sourceSelect || - !destinationSelect || - !dropzone || - openFileButtons.length === 0 - ) { + if (!fileInput || !dropzone || openFileButtons.length === 0) { return; } + // Shared: file input + dropzone openFileButtons.forEach((button) => { button.addEventListener('click', (event) => { event.stopPropagation(); fileInput.click(); }); }); - convertButton.addEventListener('click', () => { - void convertAll(); - }); - downloadZipButton.addEventListener('click', () => { - void downloadZip(); - }); - clearFilesButton.addEventListener('click', () => { - resetItems(); - setNotices([ - { - tone: 'neutral', - text: 'Selection cleared.', - }, - ]); - render(); - }); - - sourceSelect.addEventListener('change', (event) => { - onSourceChange((event.currentTarget as HTMLSelectElement).value as SourceFormatKey); - }); - - destinationSelect.addEventListener('change', (event) => { - onDestinationChange((event.currentTarget as HTMLSelectElement).value as OutputFormatKey); - }); fileInput.addEventListener('change', async () => { const selectedFiles = Array.from(fileInput.files ?? 
[]); @@ -673,7 +1021,6 @@ function bindEvents(): void { if (nextTarget instanceof Node && dropzone.contains(nextTarget)) { return; } - if (state.dragActive) { state.dragActive = false; render(); @@ -686,22 +1033,207 @@ function bindEvents(): void { void loadFiles(droppedFiles); }); + // Mode toggle (present in both modes) + document.querySelectorAll('[data-action="set-mode"]').forEach((button) => { + button.addEventListener('click', () => { + const mode = button.dataset.mode as Mode | undefined; + if (mode && mode !== state.mode) { + state.mode = mode; + render(); + } + }); + }); + + if (state.mode === 'convert') { + bindConvertEvents(); + } else { + bindOcrEvents(); + } +} + +function bindConvertEvents(): void { + const convertButton = document.querySelector('#convert-files'); + const downloadZipButton = document.querySelector('#download-zip'); + const clearFilesButton = document.querySelector('#clear-files'); + const sourceSelect = document.querySelector('#source-filter'); + const destinationSelect = document.querySelector('#destination-format'); + + if (!convertButton || !downloadZipButton || !clearFilesButton || !sourceSelect || !destinationSelect) { + return; + } + + convertButton.addEventListener('click', () => { + void convertAll(); + }); + downloadZipButton.addEventListener('click', () => { + void downloadZip(); + }); + clearFilesButton.addEventListener('click', () => { + resetItems(); + setNotices([{ tone: 'neutral', text: 'Selection cleared.' 
}]); + render(); + }); + sourceSelect.addEventListener('change', (event) => { + onSourceChange((event.currentTarget as HTMLSelectElement).value as SourceFormatKey); + }); + destinationSelect.addEventListener('change', (event) => { + onDestinationChange((event.currentTarget as HTMLSelectElement).value as OutputFormatKey); + }); + document.querySelectorAll('[data-action="download-item"]').forEach((button) => { button.addEventListener('click', () => { const id = button.dataset.id; const item = state.items.find((current) => current.id === id); + if (!item?.converted) return; + downloadBlob(item.converted.blob, item.converted.fileName); + }); + }); +} - if (!item?.converted) { - return; +function bindOcrEvents(): void { + // Format select + const formatSelect = document.querySelector('[data-action="set-ocr-format"]'); + if (formatSelect) { + formatSelect.addEventListener('change', () => { + state.ocr.exportFormat = formatSelect.value as ExportFormat; + }); + } + + // Receipt mode toggle + const receiptCheck = document.querySelector('[data-action="toggle-receipt-mode"]'); + if (receiptCheck) { + receiptCheck.addEventListener('change', () => { + state.ocr.receiptMode = receiptCheck.checked; + // Re-parse all existing results + for (const [id, result] of Object.entries(state.ocr.results)) { + if (!result.error) { + const parsed = state.ocr.receiptMode ? parseReceipt(result.text) : undefined; + state.ocr.results[id] = parsed !== undefined + ? 
{ ...result, parsed } + : { ...result, parsed: undefined }; + } } + render(); + }); + } - downloadBlob(item.converted.blob, item.converted.fileName); + // Combined toggle + const combinedCheck = document.querySelector('[data-action="toggle-combined"]'); + if (combinedCheck) { + combinedCheck.addEventListener('change', () => { + state.ocr.combined = combinedCheck.checked; + }); + } + + // Run OCR + const runBtn = document.querySelector('[data-action="run-ocr"]'); + if (runBtn) { + runBtn.addEventListener('click', () => { + void runOcrAll(); + }); + } + + // Download all + const downloadAllBtn = document.querySelector('[data-action="download-ocr-all"]'); + if (downloadAllBtn) { + downloadAllBtn.addEventListener('click', () => { + void downloadOcrAll(); + }); + } + + // Clear results + const clearResultsBtn = document.querySelector('[data-action="clear-ocr-results"]'); + if (clearResultsBtn) { + clearResultsBtn.addEventListener('click', () => { + state.ocr.results = {}; + state.ocr.progress = {}; + setNotices([{ tone: 'neutral', text: 'OCR results cleared.' 
}]);
+      render();
+    });
+  }
+
+  // Per-card: download single (the result of downloadOcrSingle is deliberately discarded via `void`)
+  document.querySelectorAll('[data-action="download-ocr-single"]').forEach((button) => {
+    button.addEventListener('click', () => {
+      const id = button.dataset.id;
+      if (!id) return;
+      void downloadOcrSingle(id);
+    });
+  });
+
+  // Per-card: remove item (hand its URLs to revokeObjectUrl, drop it from state and OCR bookkeeping, re-render)
+  document.querySelectorAll('[data-action="remove-item"]').forEach((button) => {
+    button.addEventListener('click', () => {
+      const id = button.dataset.id;
+      if (!id) return;
+      const item = state.items.find(i => i.id === id);
+      if (item) {
+        revokeObjectUrl(item.previewUrl);
+        revokeObjectUrl(item.converted?.url);
+      }
+      state.items = state.items.filter(i => i.id !== id);
+      delete state.ocr.results[id];
+      delete state.ocr.progress[id];
+      delete lastRenderedProgress[id];
+      render();
+    });
+  });
+
+  // Per-card: edit OCR text (textarea input event; ignored when the card has no result or an errored one)
+  document.querySelectorAll('[data-action="edit-ocr-text"]').forEach((textarea) => {
+    textarea.addEventListener('input', () => {
+      const id = textarea.dataset.id;
+      if (!id) return;
+      const existing = state.ocr.results[id];
+      if (!existing || existing.error) return;
+      const newText = textarea.value;
+      const parsed = state.ocr.receiptMode ? parseReceipt(newText) : undefined;
+      // `parsed` is undefined outside receipt mode, so the shorthand property
+      // both stores a fresh parse and overwrites a stale one with undefined.
+      state.ocr.results[id] = { ...existing, text: newText, parsed };
+      // Surgically update only the receipt summary to preserve textarea cursor
+      if (state.ocr.receiptMode) {
+        const card = textarea.closest('.ocr-card');
+        if (card) {
+          const summaryEl = card.querySelector('.ocr-receipt-summary');
+          const updatedResult = state.ocr.results[id];
+          if (updatedResult && !updatedResult.error && updatedResult.parsed) {
+            const p = updatedResult.parsed;
+            const div = document.createElement('div');
+            div.className = 'ocr-receipt-summary';
+            if (p.merchant) {
+              const strong = document.createElement('strong');
+              strong.textContent = p.merchant;
+              div.appendChild(strong);
+            }
+            if (p.date) {
+              const span = document.createElement('span');
+              span.textContent = p.date;
+              div.appendChild(span);
+            }
+            if (p.total !== undefined) {
+              const span = document.createElement('span');
+              span.textContent = `Total: ${p.currency ? p.currency + ' ' : ''}${p.total.toFixed(2)}`;
+              div.appendChild(span);
+            }
+            const countSpan = document.createElement('span');
+            countSpan.textContent = `${p.items.length} line item${p.items.length === 1 ? '' : 's'}`;
+            div.appendChild(countSpan);
+            if (summaryEl) {
+              summaryEl.replaceWith(div);
+            } else {
+              textarea.insertAdjacentElement('beforebegin', div);
+            }
+          }
+        }
+      }
     });
   });
 }
 
 window.addEventListener('beforeunload', () => {
   releaseItemResources(state.items);
+  void terminateOcr();
 });
 
 render();
diff --git a/src/style.css b/src/style.css
index dbde9d2..617dac0 100644
--- a/src/style.css
+++ b/src/style.css
@@ -803,3 +803,235 @@ h2 {
     scroll-behavior: auto !important;
   }
 }
+
+/* ── Mode toggle ──────────────────────────────────────────────────────────── */
+
+.mode-toggle {
+  display: inline-flex;
+  align-items: center;
+  gap: 0;
+  margin-bottom: 20px;
+  border: 1px solid var(--line-strong);
+  border-radius: var(--radius-md);
+  overflow: hidden;
+  background: var(--surface-raised);
+}
+
+.mode-toggle__btn {
+  display: inline-flex;
+  align-items: center;
+  justify-content: center;
+  min-height: 42px;
+  padding: 0 20px;
+  border: none;
+  border-radius: 0;
+  background: transparent;
+  color: var(--text-muted);
+  font-weight: 600;
+  font-size: 0.92rem;
+  letter-spacing: -0.01em;
+  transition:
+    background-color var(--duration-fast) var(--ease-out),
+    color var(--duration-fast) var(--ease-out);
+}
+
+.mode-toggle__btn + .mode-toggle__btn {
+  border-left: 1px solid var(--line-strong);
+}
+
+.mode-toggle__btn[aria-pressed="true"] {
+  background: var(--accent);
+  color: var(--button-primary-text);
+}
+
+.mode-toggle__btn:hover:not([aria-pressed="true"]):not(:disabled) {
+  background: var(--surface-strong);
+  color: var(--text);
+  transform: none;
+}
+
+/* ── OCR control panel ────────────────────────────────────────────────────── */
+
+.ocr-panel {
+  display: grid;
+  gap: 18px;
+  align-content: start;
+  padding: 22px;
+  border: 1px solid var(--line);
+  border-radius: var(--radius-xl);
+  background: var(--surface-raised);
+  box-shadow: var(--shadow-strong);
+}
+
+.ocr-panel__header {
+  display: grid;
+  gap: 6px;
+}
+
+.ocr-panel__header p {
+  color: var(--text-muted);
+  font-size: 0.95rem;
+  margin: 0;
+}
+
+.ocr-panel__check {
+  display: flex;
+  align-items: flex-start;
+  gap: 10px;
+  cursor: pointer;
+  font-size: 0.92rem;
+  line-height: 1.45;
+}
+
+.ocr-panel__check input[type="checkbox"] {
+  flex-shrink: 0;
+  margin-top: 2px;
+  accent-color: var(--accent);
+  width: 16px;
+  height: 16px;
+  cursor: pointer;
+}
+
+/* ── OCR cards ────────────────────────────────────────────────────────────── */
+
+.ocr-card {
+  display: grid;
+  gap: 0;
+  border: 1px solid var(--line);
+  border-radius: var(--radius-lg);
+  background: var(--surface-raised);
+}
+
+.ocr-card__confidence-row {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+  margin-bottom: 8px;
+}
+
+.ocr-card__text {
+  width: 100%;
+  min-height: 12rem;
+  padding: 12px;
+  border: 1px solid var(--line);
+  border-radius: var(--radius-md);
+  background: var(--surface-strong);
+  color: var(--text);
+  font-family: var(--mono);
+  font-size: 0.88rem;
+  line-height: 1.55;
+  resize: vertical;
+  transition:
+    border-color var(--duration-fast) var(--ease-out),
+    box-shadow var(--duration-fast) var(--ease-out);
+}
+
+.ocr-card__text:focus {
+  outline: 3px solid var(--focus);
+  outline-offset: 2px;
+  border-color: var(--line-strong);
+}
+
+.ocr-card__actions {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+  flex-wrap: wrap;
+  margin-top: 4px;
+}
+
+.ocr-card__pending {
+  color: var(--text-muted);
+  font-size: 0.92rem;
+  margin: 4px 0;
+}
+
+/* ── Confidence badge ─────────────────────────────────────────────────────── */
+
+.confidence-badge {
+  display: inline-flex;
+  align-items: center;
+  min-height: 26px;
+  padding: 0 10px;
+  border-radius: var(--radius-sm);
+  font-family: var(--mono);
+  font-size: 0.73rem;
+  font-weight: 700;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  border: 1px solid transparent;
+}
+
+.confidence-badge--high {
+  background: var(--success-bg);
+  border-color: rgba(31, 91, 58, 0.24);
+  color: var(--success);
+}
+
+.confidence-badge--mid {
+  background: rgba(120, 90, 20, 0.18);
+  border-color: rgba(180, 140, 40, 0.28);
+  color: #c9a227;
+}
+
+@media (prefers-color-scheme: light) {
+  .confidence-badge--mid {
+    background: rgba(180, 140, 40, 0.1);
+    color: #7a6010;
+  }
+}
+
+.confidence-badge--low {
+  background: var(--error-bg);
+  border-color: rgba(171, 47, 47, 0.24);
+  color: var(--error);
+}
+
+/* ── OCR progress bar ─────────────────────────────────────────────────────── */
+
+.ocr-progress {
+  width: 100%;
+  height: 5px;
+  border-radius: var(--radius-xs);
+  overflow: hidden;
+  background: rgba(40, 75, 99, 0.24);
+  margin-bottom: 8px;
+}
+
+.ocr-progress__fill {
+  height: 100%;
+  border-radius: inherit;
+  background: var(--accent);
+  transition: width var(--duration-base) var(--ease-out);
+}
+
+/* ── OCR receipt summary ──────────────────────────────────────────────────── */
+
+.ocr-receipt-summary {
+  display: flex;
+  flex-wrap: wrap;
+  align-items: center;
+  gap: 8px 14px;
+  padding: 10px 12px;
+  margin-bottom: 8px;
+  border: 1px solid var(--line);
+  border-radius: var(--radius-md);
+  background: var(--surface-strong);
+  font-size: 0.9rem;
+}
+
+.ocr-receipt-summary strong {
+  font-weight: 700;
+  font-size: 0.94rem;
+}
+
+.ocr-receipt-summary span {
+  color: var(--text-muted);
+  font-size: 0.88rem;
+}
+
+/* ── OCR grid layout override ─────────────────────────────────────────────── */
+
+.ocr-grid {
+  grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
+}
diff --git a/src/test-setup.ts b/src/test-setup.ts
new file mode 100644
index 0000000..a057132
--- /dev/null
+++ b/src/test-setup.ts
@@ -0,0 +1,29 @@
+import { vi } from 'vitest';
+
+// Global mock so no test ever loads the real WASM
+vi.mock('tesseract.js', () => ({
+  createWorker: vi.fn(async (_lang?: string, _oem?: number, opts?: { logger?: (m: { status: string; progress: number }) => void }) => {
+    // Simulate progress on init
+    opts?.logger?.({ status: 'loading tesseract core', progress: 0 });
+    opts?.logger?.({ status: 'initialized api', progress: 1 });
+    return {
+      recognize: vi.fn(async () => {
+        opts?.logger?.({ status: 'recognizing text', progress: 0.5 });
+        opts?.logger?.({ status: 'recognizing text', progress: 1 });
+        return {
+          data: {
+            text: 'MOCKED OCR TEXT',
+            confidence: 95,
+            words: [
+              { text: 'MOCKED', confidence: 95, bbox: { x0: 0, y0: 0, x1: 10, y1: 10 } },
+              { text: 'OCR', confidence: 95, bbox: { x0: 12, y0: 0, x1: 20, y1: 10 } },
+            ],
+          },
+        };
+      }),
+      terminate: vi.fn(async () => {}),
+      setParameters: vi.fn(async () => {}),
+      reinitialize: vi.fn(async () => {}),
+    };
+  }),
+}));
diff --git a/tests/ocr.spec.ts b/tests/ocr.spec.ts
new file mode 100644
index 0000000..a9714cb
--- /dev/null
+++ b/tests/ocr.spec.ts
@@ -0,0 +1,106 @@
+import { test, expect } from '@playwright/test';
+import path from 'node:path';
+import fs from 'node:fs';
+import os from 'node:os';
+
+const FIXTURE_TEXT =
+  'WALMART\n2025-03-14\nCoffee 4.50\nMuffin 3.25\nSubtotal 7.75\nTax 0.62\nTotal 8.37';
+
+// Tiny 1×1 PNG bytes (valid PNG, accepted by the image validator)
+const PIXEL_PNG_B64 =
+  'iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=';
+
+async function setupTestMode(page: import('@playwright/test').Page, text = FIXTURE_TEXT) {
+  await page.addInitScript((t: string) => {
+    (window as any).__ocrTestMode = { text: t };
+  }, text);
+}
+
+function writeFixturePng(): string {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'ocr-fixture-'));
+  const filePath = path.join(dir, 'receipt.png');
+  fs.writeFileSync(filePath, Buffer.from(PIXEL_PNG_B64, 'base64'));
+  return filePath;
+}
+
+test.describe('OCR workflow', () => {
+  test('extracts text from an image and exports TXT', async ({ page }) => {
+    await setupTestMode(page);
+    await page.goto('/');
+
+    // Switch to OCR mode
+    await page.click('[data-action="set-mode"][data-mode="ocr"]');
+    await expect(
+      page.locator('[data-action="set-mode"][data-mode="ocr"]'),
+    ).toHaveAttribute('aria-pressed', 'true');
+
+    // Upload file via hidden file input
+    const filePath = writeFixturePng();
+    await page.setInputFiles('#file-input', filePath);
+
+    // Wait for file to be loaded (card appears)
+    await expect(page.locator('.ocr-card')).toBeVisible({ timeout: 10_000 });
+
+    // Run OCR
+    await page.click('[data-action="run-ocr"]');
+
+    // Wait for textarea to be populated with our fixture text
+    const textarea = page.locator('[data-action="edit-ocr-text"]').first();
+    await expect(textarea).toHaveValue(/WALMART/, { timeout: 15_000 });
+    await expect(textarea).toHaveValue(/Total\s+8\.37/);
+
+    // Download TXT — "Download all" button
+    const [download] = await Promise.all([
+      page.waitForEvent('download'),
+      page.click('[data-action="download-ocr-all"]'),
+    ]);
+
+    const downloadPath = await download.path();
+    expect(downloadPath).toBeTruthy();
+    const content = fs.readFileSync(downloadPath!, 'utf8');
+    expect(content).toContain('WALMART');
+    expect(content).toContain('Total 8.37');
+  });
+
+  test('exports CSV in receipt mode with parsed line items', async ({ page }) => {
+    await setupTestMode(page);
+    await page.goto('/');
+    await page.click('[data-action="set-mode"][data-mode="ocr"]');
+    await expect(
+      page.locator('[data-action="set-mode"][data-mode="ocr"]'),
+    ).toHaveAttribute('aria-pressed', 'true');
+
+    // Enable receipt mode via the checkbox
+    await page.check('[data-action="toggle-receipt-mode"]');
+
+    // Select CSV format via