diff --git a/.github/workflows/node.js.yml b/.github/workflows/node.js.yml index 547222d94..2657ce255 100644 --- a/.github/workflows/node.js.yml +++ b/.github/workflows/node.js.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: - node-version: [14.x, 16.x, 18.x, 20.x, 22.x] + node-version: [14.x, 16.x, 18.x, 20.x, 22.x, 24.x] steps: - uses: actions/checkout@v3 diff --git a/docs/api.md b/docs/api.md index cc3f82d7d..2cb0ee039 100644 --- a/docs/api.md +++ b/docs/api.md @@ -34,8 +34,12 @@ - `langs` a string to indicate the languages traineddata to download, multiple languages are specified using an array (['eng', 'chi_sim']) - `oem` a enum to indicate the OCR Engine Mode you use - `options` an object of customized options - - `corePath` path to a directory containing **both** `tesseract-core.wasm.js` and `tesseract-core-simd.wasm.js` from [Tesseract.js-core](https://www.npmjs.com/package/tesseract.js-core) package - - Setting `corePath` to a specific `.js` file is **strongly discouraged.** To provide the best performance on all devices, Tesseract.js needs to be able to pick between `tesseract-core.wasm.js` and `tesseract-core-simd.wasm.js`. See [this issue](https://github.com/naptha/tesseract.js/issues/735) for more detail. + - `corePath` path to a directory containing **all of** the following files from [Tesseract.js-core](https://www.npmjs.com/package/tesseract.js-core) package: + - `tesseract-core.wasm.js` + - `tesseract-core-simd.wasm.js` + - `tesseract-core-lstm.wasm.js` + - `tesseract-core-simd-lstm.wasm.js` + - Some code snippets found online set `corePath` to a specific `.js` file. This is **strongly discouraged.** To provide the best performance and lowest network usage, Tesseract.js needs to be able to pick between builds. 
- `langPath` path for downloading traineddata, do not include `/` at the end of the path - `workerPath` path for downloading worker script - `dataPath` path for saving traineddata in WebAssembly file system, not common to modify diff --git a/docs/image-format.md b/docs/image-format.md index 8f72f8d1a..b647ed420 100644 --- a/docs/image-format.md +++ b/docs/image-format.md @@ -2,7 +2,7 @@ The main Tesseract.js functions (ex. recognize, detect) take an `image` parameter. The image formats and data types supported are listed below. -Support Image Formats: **bmp, jpg, png, pbm, webp** +Supported Image Formats: **bmp, jpg, png, pbm, webp, gif \[non-animated\]**. For browser and Node, supported data types are: - string with base64 encoded image (fits `data:image\/([a-zA-Z]*);base64,([^"]*)` regexp) @@ -15,4 +15,4 @@ For browser only, supported data types are: For Node only, supported data types are: - string containing a path to local image -Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported. \ No newline at end of file +Note: images must be a supported image format **and** a supported data type. For example, a buffer containing a png image is supported. A buffer containing raw pixel data is not supported. 
diff --git a/package-lock.json b/package-lock.json index 11cbe9ff0..8b72fe5d4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -4630,14 +4630,15 @@ } }, "node_modules/form-data": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.2.tgz", - "integrity": "sha512-hGfm/slu0ZabnNt4oaRZ6uREyfCj6P4fT/n6A1rGV+Z0VdGXjfOhVUpkn6qVQONHGIFwmveGXyDs75+nr6FM8w==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.4.tgz", + "integrity": "sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==", "dev": true, "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", "mime-types": "^2.1.12" }, "engines": { @@ -9292,9 +9293,9 @@ "dev": true }, "node_modules/tmp": { - "version": "0.2.3", - "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.3.tgz", - "integrity": "sha512-nZD7m9iCPC5g0pYmcaxogYKggSfLsdxl8of3Q/oIbqCqLLIO9IAF0GWjX1z9NZRHPiXv8Wex4yDCaZsgEw0Y8w==", + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.4.tgz", + "integrity": "sha512-UdiSoX6ypifLmrfQ/XfiawN6hkjSBpCjhKxxZcWlUUmoXLaCKQU0bx4HF/tdDK2uzRuchf1txGvrWBzYREssoQ==", "dev": true, "engines": { "node": ">=14.14" @@ -13625,14 +13626,15 @@ } }, "form-data": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.2.tgz", - "integrity": "sha512-hGfm/slu0ZabnNt4oaRZ6uREyfCj6P4fT/n6A1rGV+Z0VdGXjfOhVUpkn6qVQONHGIFwmveGXyDs75+nr6FM8w==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.4.tgz", + "integrity": "sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==", "dev": true, "requires": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, @@ -17005,9 +17007,9 @@ "dev": true }, "tmp": { - "version": "0.2.3", - 
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.3.tgz", - "integrity": "sha512-nZD7m9iCPC5g0pYmcaxogYKggSfLsdxl8of3Q/oIbqCqLLIO9IAF0GWjX1z9NZRHPiXv8Wex4yDCaZsgEw0Y8w==", + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.2.4.tgz", + "integrity": "sha512-UdiSoX6ypifLmrfQ/XfiawN6hkjSBpCjhKxxZcWlUUmoXLaCKQU0bx4HF/tdDK2uzRuchf1txGvrWBzYREssoQ==", "dev": true }, "to-regex-range": { diff --git a/src/index.d.ts b/src/index.d.ts index 1f5a9c809..2c9689db3 100644 --- a/src/index.d.ts +++ b/src/index.d.ts @@ -41,7 +41,7 @@ declare namespace Tesseract { load_number_dawg: string load_bigram_dawg: string } - + type LoggerMessage = { jobId: string progress: number @@ -49,7 +49,7 @@ declare namespace Tesseract { userJobId: string workerId: string } - + interface WorkerOptions { corePath: string langPath: string @@ -161,12 +161,11 @@ declare namespace Tesseract { y0: number; x1: number; y1: number; - has_baseline: boolean; } interface RowAttributes { ascenders: number; descenders: number; - row_height: number; + rowHeight: number; } interface Bbox { x0: number; diff --git a/tests/recognize.test.mjs b/tests/recognize.test.mjs index 692e0872e..e11d34fde 100644 --- a/tests/recognize.test.mjs +++ b/tests/recognize.test.mjs @@ -318,6 +318,21 @@ describe('recognize()', () => { expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('繁體'); expect(blocks[0].paragraphs[0].lines[0].text).to.be('繁體 中 文 測試\n'); }).timeout(TIMEOUT); + + it('should report RowAttributes', async () => { + await worker.reinitialize('eng'); + const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/testocr.png`, {}, { blocks: true }); + const firstLine = blocks[0].paragraphs[0].lines[0]; + + expect(firstLine.rowAttributes).to.be.an('object'); + expect(firstLine.rowAttributes.ascenders).to.be.a('number'); + expect(firstLine.rowAttributes.descenders).to.be.a('number'); + expect(firstLine.rowAttributes.rowHeight).to.be.a('number'); + + 
expect(firstLine.rowAttributes.ascenders).to.be.greaterThan(0); + expect(firstLine.rowAttributes.descenders).to.be.greaterThan(0); + expect(firstLine.rowAttributes.rowHeight).to.be.greaterThan(0); + }).timeout(TIMEOUT); }); describe('should support layout blocks (json) output', () => {