Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 9fa6c41

Browse files
[ai-form-recognizer] Lazy iterator for words of a line (Azure#18444)
* [ai-form-recognizer] Lazy iterator for words of a line
* Use method instead of property
* Regenerate API
* Polished, wrote changelog, added some more tests, samples
* Updated API MD
* Improved docs
* Apply changes from review
1 parent c28fe80 commit 9fa6c41

File tree

10 files changed

+396
-13
lines changed

10 files changed

+396
-13
lines changed

sdk/formrecognizer/ai-form-recognizer/CHANGELOG.md

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
# Release History
22

3-
## 4.0.0-beta.2 (Unreleased)
3+
## 4.0.0-beta.2 (2021-11-09)
44

55
### Features Added
66

7-
### Breaking Changes
7+
- Added a `words` method to `DocumentLine`. This method produces an `IterableIterator` that will yield all of the `DocumentWord`s that are contained by the line's `spans`. This allows accessing the words that are related to the line from the line itself.
88

99
### Bugs Fixed
1010

11-
### Other Changes
12-
1311
## 4.0.0-beta.1 (2021-10-07)
1412

1513
This new major version beta introduces a full redesign of the Azure Form Recognizer client library. To leverage features of the newest Form Recognizer service API (version "2021-09-30-preview" and newer), the new SDK is required, and application code must be changed to use the new clients. Please see the [Migration Guide](https://github.com/azure/azure-sdk-for-js/blob/main/sdk/formrecognizer/ai-form-recognizer/MIGRATION-v3_v4.md) for detailed instructions on how to update application code from version 3.x of the Form Recognizer SDK to the new version (4.x). The following sections contain an outline of the changes.

sdk/formrecognizer/ai-form-recognizer/review/ai-form-recognizer.api.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ export interface DocumentLine {
337337
boundingBox?: number[];
338338
content: string;
339339
spans: DocumentSpan[];
340+
words: () => IterableIterator<DocumentWord>;
340341
}
341342

342343
// @public

sdk/formrecognizer/ai-form-recognizer/samples-dev/buildModel.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515

1616
import { AzureKeyCredential, DocumentModelAdministrationClient } from "@azure/ai-form-recognizer";
1717

18+
import * as dotenv from "dotenv";
19+
dotenv.config();
20+
1821
async function main() {
1922
const endpoint = process.env.FORM_RECOGNIZER_ENDPOINT ?? "<endpoint>";
2023
const credential = new AzureKeyCredential(process.env.FORM_RECOGNIZER_API_KEY ?? "<api key>");

sdk/formrecognizer/ai-form-recognizer/samples-dev/composeModel.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
import { DocumentModelAdministrationClient, AzureKeyCredential } from "@azure/ai-form-recognizer";
1616

17-
// Load the .env file if it exists
1817
import * as dotenv from "dotenv";
1918
dotenv.config();
2019

sdk/formrecognizer/ai-form-recognizer/samples-dev/copyModel.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010

1111
import { AzureKeyCredential, DocumentModelAdministrationClient } from "@azure/ai-form-recognizer";
1212

13+
import * as dotenv from "dotenv";
14+
dotenv.config();
15+
1316
async function main() {
1417
const endpoint = process.env.FORM_RECOGNIZER_ENDPOINT ?? "<endpoint>";
1518
const credential = new AzureKeyCredential(process.env.FORM_RECOGNIZER_API_KEY ?? "<api key>");

sdk/formrecognizer/ai-form-recognizer/samples-dev/extractLayout.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,20 @@ async function main() {
3737
console.log("- Page", page.pageNumber, `(unit: ${page.unit})`);
3838
console.log(` ${page.width}x${page.height}, angle: ${page.angle}`);
3939
console.log(` ${page.lines.length} lines, ${page.words.length} words`);
40+
41+
if (page.lines.length > 0) {
42+
console.log(" Lines:");
43+
44+
for (const line of page.lines) {
45+
console.log(` - "${line.content}"`);
46+
47+
// The words of the line can also be iterated independently. The words are computed based on their
48+
// corresponding spans.
49+
for (const word of line.words()) {
50+
console.log(` - "${word.content}"`);
51+
}
52+
}
53+
}
4054
}
4155
}
4256

sdk/formrecognizer/ai-form-recognizer/src/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@ export {
2222
DocumentFieldType,
2323
DocumentKeyValueElement,
2424
DocumentKeyValuePair,
25-
DocumentLine,
26-
DocumentPage,
2725
DocumentSelectionMark,
2826
DocumentSignatureType,
2927
DocumentSpan,
@@ -49,6 +47,8 @@ export {
4947
export {
5048
AnalysisPoller,
5149
AnalyzeResult,
50+
DocumentPage,
51+
DocumentLine,
5252
DocumentAnalysisPollOperationState,
5353
AnalyzedDocument,
5454
FormRecognizerRequestBody,

sdk/formrecognizer/ai-form-recognizer/src/lro/analyze.ts

Lines changed: 236 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,14 @@ import {
1111
Document as GeneratedDocument,
1212
DocumentEntity,
1313
DocumentKeyValuePair,
14-
DocumentPage,
14+
DocumentPage as GeneratedDocumentPage,
15+
DocumentLine as GeneratedDocumentLine,
16+
DocumentSelectionMark,
1517
DocumentSpan,
1618
DocumentStyle,
1719
DocumentTable,
20+
DocumentWord,
21+
LengthUnit,
1822
} from "../generated";
1923
import { DocumentField, toAnalyzedDocumentFieldsFromGenerated } from "../models/fields";
2024
import { FormRecognizerApiVersion, PollerOptions } from "../options";
@@ -67,7 +71,6 @@ export interface AnalyzedDocument {
6771
* Transform a REST-level Document response object into the more strongly-typed AnalyzedDocument.
6872
*
6973
* @internal
70-
*
7174
* @param document - a REST-level document response object
7275
* @returns an AnalyzedDocument (which has had its fields mapped to stronger DocumentField types)
7376
*/
@@ -132,6 +135,236 @@ export interface AnalyzeResult<Document = AnalyzedDocument> {
132135
documents: Document[];
133136
}
134137

138+
/**
 * A single page of an analysis result.
 */
export interface DocumentPage {
  /**
   * The number of this page within the input document, counted starting from 1.
   */
  pageNumber: number;

  /**
   * General orientation of the page content, in degrees measured clockwise, within the range (-180, 180].
   */
  angle: number;

  /**
   * Width of the page: in pixels when the input is an image, in inches when it is a PDF.
   */
  width: number;

  /**
   * Height of the page: in pixels when the input is an image, in inches when it is a PDF.
   */
  height: number;

  /**
   * The unit that `width`, `height`, and `boundingBox` values are expressed in ("pixel" for images, "inch" for PDF).
   */
  unit: LengthUnit;

  /**
   * Where this page is located within the reading-order concatenated content.
   */
  spans: DocumentSpan[];

  /**
   * The words that were extracted from this page.
   */
  words: DocumentWord[];

  /**
   * The selection marks that were extracted from this page.
   */
  selectionMarks?: DocumentSelectionMark[];

  /**
   * The lines that were extracted from this page. A line may contain both textual and visual elements.
   */
  lines: DocumentLine[];
}
187+
188+
/**
 * Turns a REST-level DocumentPage into its convenience-layer counterpart.
 *
 * The conversion happens in place: the `lines` array of the given page is replaced with an array of converted lines
 * and the very same page object is handed back.
 *
 * @internal
 * @param generated - a REST-level DocumentPage.
 * @returns the input page, typed as a convenience-layer DocumentPage
 */
export function toDocumentPageFromGenerated(generated: GeneratedDocumentPage): DocumentPage {
  // Only `lines` has to change, so it is swapped out on the existing object instead of building a new page.
  const convertedLines = generated.lines.map((rawLine) =>
    toDocumentLineFromGenerated(rawLine, generated)
  );
  generated.lines = convertedLines;

  return generated as DocumentPage;
}
201+
202+
/**
 * A line on a page, made up of adjacent content elements.
 */
export interface DocumentLine {
  /**
   * The content of the elements in this line, concatenated in reading order.
   */
  content: string;

  /**
   * The bounding box that surrounds the line.
   */
  boundingBox?: number[];

  /**
   * Where this line is located within the reading-order concatenated content.
   */
  spans: DocumentSpan[];

  /**
   * Produce the `DocumentWord`s that belong to this line.
   *
   * The result is a lazy iterator: each word is yielded before the next one is computed.
   */
  words: () => IterableIterator<DocumentWord>;
}
228+
229+
/**
 * Checks whether the `outer` span fully covers the `inner` span: `outer` must begin at or before the character where
 * `inner` begins, and must end at or after the position where `inner` ends.
 *
 * @internal
 * @param outer - the span that may contain the other one
 * @param inner - the span that is tested for being inside of `outer`
 * @returns true if `outer` contains `inner`, false otherwise.
 */
export function contains(outer: DocumentSpan, inner: DocumentSpan): boolean {
  const outerEnd = outer.offset + outer.length;
  const innerEnd = inner.offset + inner.length;

  const startsInside = outer.offset <= inner.offset;
  const endsInside = outerEnd >= innerEnd;

  return startsInside && endsInside;
}
242+
243+
/**
 * A generator that never yields anything. It exists so that code which must return an iterator always has one to
 * return, even when there is nothing to iterate over.
 */
function* empty(): Generator<never> {
  /* nothing to yield: the generator completes as soon as it is started */
}
249+
250+
/**
 * Lazily walks through `items`, beginning at position `idx` and continuing to the end of the array.
 *
 * @param items - the array whose elements are yielded
 * @param idx - the position of the first element to yield
 */
function* iterFrom<T>(items: T[], idx: number): Generator<T> {
  // The array length is re-read on every step, exactly as long as there are positions left to visit.
  for (let position = idx; position < items.length; position += 1) {
    yield items[position];
  }
}
263+
264+
/**
 * Binary search through an array of items to find the first item that could possibly be contained by the given span,
 * then return an iterator beginning from that item.
 *
 * The first candidate is the first item whose span starts at or after the start of `span`. This allows a program to
 * quickly find the first relevant item in the array for consideration when testing for span inclusion.
 *
 * INVARIANT: `items` MUST BE SORTED INCREASING by span _offset_.
 *
 * An empty `items` array, or one in which every item starts before `span`, produces an empty iterator.
 *
 * @internal
 * @param span - the span to use when testing each individual item
 * @param items - an array of items to binary search through
 * @returns an iterator beginning from the item identified by the search
 */
export function iteratorFromFirstMatchBinarySearch<Spanned extends { span: DocumentSpan }>(
  span: DocumentSpan,
  items: Spanned[]
): IterableIterator<Spanned> {
  // Half-open search window [low, high): every item before `low` starts before `span`, and every item at or after
  // `high` starts at or after `span`. For an empty array the window is empty and the loop body never runs, so no
  // element is ever dereferenced.
  let low = 0;
  let high = items.length;

  while (low < high) {
    const mid = low + Math.floor((high - low) / 2);

    if (items[mid].span.offset < span.offset) {
      // `mid` starts too early to be contained, and so does everything before it.
      low = mid + 1;
    } else {
      // `mid` is a candidate, but an earlier item might be one as well.
      high = mid;
    }
  }

  // `low` is now the index of the first candidate, or `items.length` if there is none.
  return low < items.length ? iterFrom(items, low) : empty();
}
306+
307+
/**
 * This fast algorithm tests the elements of `childrenArray` for inclusion in any of the given `spans`, assuming that
 * both the spans and child items are sorted.
 *
 * INVARIANT: the items in both the `spans` iterator and `childrenArray` MUST BE SORTED INCREASING by span _offset_.
 *
 * Both sequences are walked once, in lockstep. A child that begins exactly where the current span ends is tested
 * against the following span rather than discarded, so children of directly adjacent spans are not lost.
 *
 * @internal
 * @param spans - the spans that contain the child elements
 * @param childrenArray - an array of child items (items that have spans) to test for inclusion in the spans
 * @returns - an IterableIterator of child items that are included in any span in the `spans` iterator
 */
export function* fastGetChildren<Spanned extends { span: DocumentSpan }>(
  spans: Iterator<DocumentSpan>,
  childrenArray: Spanned[]
): Generator<Spanned> {
  let curSpan = spans.next();

  // Need to exit early if there are no spans.
  if (curSpan.done) {
    return;
  }

  // Jump straight to the first child that starts at or after the first span. Because both inputs are sorted, no
  // earlier child can be contained in this span or in any later one.
  const children = iteratorFromFirstMatchBinarySearch(curSpan.value as DocumentSpan, childrenArray);
  let curChild = children.next();

  while (!(curChild.done || curSpan.done)) {
    if (contains(curSpan.value, curChild.value.span)) {
      // The span is contained, so yield the current child and advance it.
      yield curChild.value;
      curChild = children.next();
    } else if (curSpan.value.offset + curSpan.value.length <= curChild.value.span.offset) {
      // The current span ends at or before the position where the next potential child starts, so the child cannot
      // belong to it. Advance the span and test the same child again. The comparison is inclusive so that a child
      // starting exactly at the end of this span still gets tested against the next span.
      curSpan = spans.next();
    } else {
      // The current child was not contained in the current span, so advance to the next child.
      curChild = children.next();
    }
  }
}
346+
347+
/**
 * Turns a REST-level document line into its convenience-layer counterpart by attaching a `words` function to it.
 *
 * The line object is extended in place and returned; no copy is made.
 *
 * @param generated - a REST-level DocumentLine
 * @param page - the page where the DocumentLine appeared
 * @returns a convenience layer DocumentLine
 */
function toDocumentLineFromGenerated(
  generated: GeneratedDocumentLine,
  page: GeneratedDocumentPage
): DocumentLine {
  const line = generated as DocumentLine;

  // `words` is defined as a writable, configurable, but non-enumerable property, so it does not show up in
  // `Object.keys`, `for...in`, or `JSON.stringify` output of the line.
  Object.defineProperty(line, "words", {
    // Each call starts a fresh lazy search of the page's words, using the line's spans as they are at that moment.
    value: (): IterableIterator<DocumentWord> =>
      fastGetChildren(iterFrom(generated.spans, 0), page.words),
    writable: true,
    configurable: true,
    enumerable: false,
  });

  return line;
}
367+
135368
/**
136369
* The state of an analysis operation, which will eventually produce the result type that corresponds to the model.
137370
*/
@@ -192,7 +425,7 @@ export function toAnalyzeResultFromGenerated<
192425
apiVersion: result.apiVersion as FormRecognizerApiVersion,
193426
modelId: result.modelId,
194427
content: result.content,
195-
pages: result.pages,
428+
pages: result.pages.map((page) => toDocumentPageFromGenerated(page)),
196429
tables: result.tables ?? [],
197430
keyValuePairs: result.keyValuePairs ?? [],
198431
entities: result.entities ?? [],

sdk/formrecognizer/ai-form-recognizer/src/models/LayoutResult.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
// Copyright (c) Microsoft Corporation.
22
// Licensed under the MIT license.
33

4-
import { DocumentPage, DocumentStyle, DocumentTable } from "../generated";
5-
import { AnalyzeResult } from "../lro/analyze";
4+
import { DocumentStyle, DocumentTable } from "../generated";
5+
import { AnalyzeResult, DocumentPage, toDocumentPageFromGenerated } from "../lro/analyze";
66

77
/**
88
* Extract from an AnalyzeResult the fields that are produced from layout analysis.
@@ -12,7 +12,7 @@ export function toLayoutResult(analyzeResult: AnalyzeResult<unknown>): LayoutRes
1212
const { pages, tables, styles } = analyzeResult;
1313

1414
return {
15-
pages,
15+
pages: pages.map(toDocumentPageFromGenerated),
1616
tables,
1717
styles,
1818
};

0 commit comments

Comments
 (0)