@@ -11,10 +11,14 @@ import {
11
11
Document as GeneratedDocument ,
12
12
DocumentEntity ,
13
13
DocumentKeyValuePair ,
14
- DocumentPage ,
14
+ DocumentPage as GeneratedDocumentPage ,
15
+ DocumentLine as GeneratedDocumentLine ,
16
+ DocumentSelectionMark ,
15
17
DocumentSpan ,
16
18
DocumentStyle ,
17
19
DocumentTable ,
20
+ DocumentWord ,
21
+ LengthUnit ,
18
22
} from "../generated" ;
19
23
import { DocumentField , toAnalyzedDocumentFieldsFromGenerated } from "../models/fields" ;
20
24
import { FormRecognizerApiVersion , PollerOptions } from "../options" ;
@@ -67,7 +71,6 @@ export interface AnalyzedDocument {
67
71
* Transform a REST-level Document response object into the more strongly-typed AnalyzedDocument.
68
72
*
69
73
* @internal
70
- *
71
74
* @param document - a REST-level document response object
72
75
* @returns an AnalyzedDocument (which has had its fields mapped to stronger DocumentField types)
73
76
*/
@@ -132,6 +135,236 @@ export interface AnalyzeResult<Document = AnalyzedDocument> {
132
135
documents : Document [ ] ;
133
136
}
134
137
138
/**
 * Describes a single page of an analysis result.
 */
export interface DocumentPage {
  /**
   * The number of this page within the input document. Numbering starts at 1.
   */
  pageNumber: number;

  /**
   * General orientation of the page content, expressed in degrees in the clockwise direction, within the range
   * (-180, 180].
   */
  angle: number;

  /**
   * Page width: pixels when the input is an image, inches when the input is a PDF.
   */
  width: number;

  /**
   * Page height: pixels when the input is an image, inches when the input is a PDF.
   */
  height: number;

  /**
   * Unit in which the width, height, and boundingBox properties are expressed: "pixel" for images, "inch" for PDF.
   */
  unit: LengthUnit;

  /**
   * Where this page is located within the concatenated, reading-order content.
   */
  spans: DocumentSpan[];

  /**
   * The words that were extracted from this page.
   */
  words: DocumentWord[];

  /**
   * The selection marks that were extracted from this page, if any.
   */
  selectionMarks?: DocumentSelectionMark[];

  /**
   * The lines that were extracted from this page. A line may contain both textual and visual elements.
   */
  lines: DocumentLine[];
}
187
+
188
/**
 * Turn a REST-level DocumentPage into its convenience-layer counterpart.
 *
 * The input object is modified in place: every entry of its `lines` array is replaced by the result of
 * `toDocumentLineFromGenerated`, and no new page object is allocated.
 *
 * @internal
 * @param generated - a REST-level DocumentPage.
 * @returns the same object that was passed in, typed as a convenience-layer DocumentPage.
 */
export function toDocumentPageFromGenerated(generated: GeneratedDocumentPage): DocumentPage {
  // Convert each line first, handing it the page so it can look up the page's words later.
  const convertedLines = generated.lines.map((line) =>
    toDocumentLineFromGenerated(line, generated)
  );

  // Replace the property on the existing object instead of building a copy of the page.
  generated.lines = convertedLines;

  return generated as DocumentPage;
}
201
+
202
/**
 * A line on a page, made up of adjacent content elements.
 */
export interface DocumentLine {
  /**
   * The content of the elements contained in the line, concatenated in reading order.
   */
  content: string;

  /**
   * The bounding box of the line, when available.
   */
  boundingBox?: number[];

  /**
   * Where this line is located within the concatenated, reading-order content.
   */
  spans: DocumentSpan[];

  /**
   * Computes the `DocumentWord`s related to this line.
   *
   * The returned iterator is lazy: each word is yielded before the next one is computed.
   */
  words: () => IterableIterator<DocumentWord>;
}
228
+
229
/**
 * Decides whether the `inner` span lies completely within the `outer` span.
 *
 * That is the case when `outer` begins at or before the first character of `inner`, and the end position of `outer`
 * (offset plus length) is at or after the end position of `inner`.
 *
 * @internal
 * @param outer - the outer (potentially containing) span
 * @param inner - the span to test if `outer` contains
 * @returns true if `inner` is contained inside of `outer`.
 */
export function contains(outer: DocumentSpan, inner: DocumentSpan): boolean {
  const outerEnd = outer.offset + outer.length;
  const innerEnd = inner.offset + inner.length;

  const startsWithinOuter = outer.offset <= inner.offset;
  const endsWithinOuter = innerEnd <= outerEnd;

  return startsWithinOuter && endsWithinOuter;
}
242
+
243
/**
 * Creates a generator that finishes without ever yielding a value. It exists so that code which must return an
 * iterator has something to return when there are no items to produce.
 */
function* empty(): Generator<never> {
  // Completing immediately means the first call to `next()` already reports `done`.
  return;
}
249
+
250
/**
 * Lazily yields the elements of `items`, beginning at position `idx` and continuing to the end of the array.
 *
 * @param items - the items to iterate over
 * @param idx - the index of the first item to begin iterating from
 */
function* iterFrom<T>(items: T[], idx: number): Generator<T> {
  // The length is re-read on every step, so a start index at or past the end simply yields nothing.
  for (let position = idx; position < items.length; position += 1) {
    yield items[position];
  }
}
263
+
264
/**
 * Binary search through an array of items to find the first item that could possibly be contained by the given span,
 * then return an iterator beginning from that item.
 *
 * This allows a program to quickly find the first relevant item in the array for consideration when testing for span
 * inclusion.
 *
 * The search looks for the first item whose span offset is greater than or equal to `span.offset`. It relies on
 * `items` being sorted by increasing span offset. If `items` is empty, or no item starts at or after `span.offset`,
 * an iterator that yields nothing is returned.
 *
 * @internal
 * @param span - the span to use when testing each individual item
 * @param items - an array of items to binary search through
 * @returns an iterator beginning from the item identified by the search
 */
export function iteratorFromFirstMatchBinarySearch<Spanned extends { span: DocumentSpan }>(
  span: DocumentSpan,
  items: Spanned[]
): IterableIterator<Spanned> {
  // With no items there is nothing to probe, and reading `items[0].span` below would throw.
  if (items.length === 0) {
    return empty();
  }

  let idx = Math.floor(items.length / 2);
  let prevIdx = idx;
  let min = 0;
  let max = items.length;

  const found = (): boolean =>
    // The item is found if it starts at or after the current span and the item before it does not. That means it is
    // the first item in the array that could be a child if the spans are sorted. When there is no previous item
    // (idx === 0), -1 stands in for its offset so that the first item can match.
    items[idx].span.offset >= span.offset && (items[idx - 1]?.span?.offset ?? -1) < span.offset;

  // Binary search to find the first element that could be a child
  do {
    if (found()) {
      return iterFrom(items, idx);
    } else if (span.offset > items[idx].span.offset) {
      // The probed item starts too early: move halfway toward the upper bound.
      min = prevIdx = idx;
      idx = Math.floor(idx + (max - idx) / 2);
    } else {
      // The probed item starts late enough, but so does its predecessor: move halfway toward the lower bound.
      max = prevIdx = idx;
      idx = Math.floor(idx - (idx - min) / 2);
    }
    // Once the probe position stops moving, the search is exhausted.
  } while (idx !== prevIdx);

  // This might seem weird, but it's a simple way to make the types a little more elegant.
  return empty();
}
306
+
307
/**
 * This fast algorithm tests the elements of `childrenArray` for inclusion in any of the given `spans`, assuming that
 * both the spans and child items are sorted.
 *
 * INVARIANT: the items in both the `spans` iterator and `childrenArray` MUST BE SORTED INCREASING by span _offset_.
 *
 * The first span is used to binary search for the first candidate child. After that, spans and children are walked
 * forward together, so each span and each child is visited at most once.
 *
 * @internal
 * @param spans - the spans that contain the child elements
 * @param childrenArray - an array of child items (items that have spans) to test for inclusion in the spans
 * @returns - an IterableIterator of child items that are included in any span in the `spans` iterator
 */
export function* fastGetChildren<Spanned extends { span: DocumentSpan }>(
  spans: Iterator<DocumentSpan>,
  childrenArray: Spanned[]
): Generator<Spanned> {
  let curSpan = spans.next();

  // Need to exit early if there are no spans, or if there are no children that could be yielded.
  if (curSpan.done || childrenArray.length === 0) {
    return;
  }

  const children = iteratorFromFirstMatchBinarySearch(curSpan.value as DocumentSpan, childrenArray);
  let curChild = children.next();

  while (!(curChild.done || curSpan.done)) {
    if (contains(curSpan.value, curChild.value.span)) {
      // The span is contained, so yield the current child and advance it.
      yield curChild.value;
      curChild = children.next();
    } else if (curSpan.value.offset + curSpan.value.length <= curChild.value.span.offset) {
      // The current span ends at or before the position where the next potential child starts, so advance the span.
      // The comparison is inclusive because `offset + length` is the first position past the span: a child that
      // starts exactly there (and was not contained above) can only belong to a later span, so it must be kept and
      // tested against the next span rather than dropped.
      curSpan = spans.next();
    } else {
      // The current child starts before the current span ends but was not contained in it, so advance to the next
      // child.
      curChild = children.next();
    }
  }
}
346
+
347
/**
 * Turn a REST-level document line into its convenience-layer counterpart.
 *
 * The input object is modified in place: it gains a `words` function that lazily looks up, among the words of
 * `page`, those that fall within the spans of the line. The property is then made non-enumerable.
 *
 * @param generated - a REST-level DocumentLine
 * @param page - the page where the DocumentLine appeared
 * @returns a convenience layer DocumentLine
 */
function toDocumentLineFromGenerated(
  generated: GeneratedDocumentLine,
  page: GeneratedDocumentPage
): DocumentLine {
  const line = generated as DocumentLine;

  // Each call starts a fresh pass over the line's spans and the page's words.
  line.words = () => fastGetChildren(iterFrom(generated.spans, 0), page.words);

  // Only the `enumerable` attribute is changed; the value assigned above stays in place.
  Object.defineProperty(line, "words", { enumerable: false });

  return line;
}
367
+
135
368
/**
136
369
* The state of an analysis operation, which will eventually produce the result type that corresponds to the model.
137
370
*/
@@ -192,7 +425,7 @@ export function toAnalyzeResultFromGenerated<
192
425
apiVersion : result . apiVersion as FormRecognizerApiVersion ,
193
426
modelId : result . modelId ,
194
427
content : result . content ,
195
- pages : result . pages ,
428
+ pages : result . pages . map ( ( page ) => toDocumentPageFromGenerated ( page ) ) ,
196
429
tables : result . tables ?? [ ] ,
197
430
keyValuePairs : result . keyValuePairs ?? [ ] ,
198
431
entities : result . entities ?? [ ] ,
0 commit comments