From 78d6206d9f8b490d9930ad9e59b6e8081f6aea0f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 17 Jun 2021 17:22:39 +0200 Subject: [PATCH 1/2] add ALTO schema --- ocrd_validators/ocrd_validators/constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ocrd_validators/ocrd_validators/constants.py b/ocrd_validators/ocrd_validators/constants.py index 25d2e0e53b..a09a9f858c 100644 --- a/ocrd_validators/ocrd_validators/constants.py +++ b/ocrd_validators/ocrd_validators/constants.py @@ -15,6 +15,7 @@ 'OCRD_BAGIT_PROFILE_URL', 'XSD_METS_URL', 'XSD_PAGE_URL', + 'XSD_ALTO_URL', 'XSD_PATHS', ] @@ -29,6 +30,8 @@ OCRD_BAGIT_PROFILE_URL = 'https://ocr-d.github.io/bagit-profile.json' XSD_METS_URL = 'https://www.loc.gov/standards/mets/mets.xsd' XSD_PAGE_URL = 'http://www.primaresearch.org/schema/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd' +XSD_ALTO_URL = 'http://www.loc.gov/standards/alto/v4/alto-4-2.xsd' XSD_PATHS = {} XSD_PATHS[XSD_METS_URL] = resource_filename(__name__, 'mets.xsd') XSD_PATHS[XSD_PAGE_URL] = resource_filename(__name__, 'page.xsd') +XSD_PATHS[XSD_ALTO_URL] = resource_filename(__name__, 'alto.xsd') From ddec3c084d2dfcb09f6f592d37dc165c2aac83e1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Thu, 17 Jun 2021 17:24:38 +0200 Subject: [PATCH 2/2] add copy of ALTO v4.2 schema file --- ocrd_validators/ocrd_validators/alto.xsd | 1105 ++++++++++++++++++++++ 1 file changed, 1105 insertions(+) create mode 100644 ocrd_validators/ocrd_validators/alto.xsd diff --git a/ocrd_validators/ocrd_validators/alto.xsd b/ocrd_validators/ocrd_validators/alto.xsd new file mode 100644 index 0000000000..cfae77620b --- /dev/null +++ b/ocrd_validators/ocrd_validators/alto.xsd @@ -0,0 +1,1105 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ALTO (analyzed layout and text object) stores layout information and + OCR recognized text of pages of any kind of 
printed documents like books, journals and newspapers. + ALTO is a standardized XML format to store layout and content information. + It is designed to be used as an extension schema to METS (Metadata Encoding and Transmission Standard), + where METS provides metadata and structural information while ALTO contains content and physical information. + + + + + + + + Describes general settings of the alto file like measurement units and metadata + + + + + Styles define properties of layout elements. A style defined in a parent element is used as default style for all related children elements. + + + + + + Tags define properties of additional characteristics. The tags are referenced from the related content element on Block or String elements by the attribute TAGREF via the tag ID. + This container element contains the individual elements for LayoutTags, StructureTags, RoleTags, NamedEntityTags and OtherTags + + + + + + The root layout element. + + + + + + Schema version of the ALTO file. + + + + + + + + + + Element deprecated. 'Processing' should be used instead. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The following variations of tag types are available: + LayoutTag – criteria about arrangement or graphical appearance + StructureTag – criteria about grouping or formation + RoleTag – criteria about function or mission + NamedEntityTag – criteria about assignment of terms to their relationship / meaning (NER) + OtherTag – criteria about any other characteristic not listed above, the TYPE attribute is intended to be used for classification within those. + + + + + + + + + + + + + + + Gives brief information about original page quality + + + + + + + + + + + + + + Gives more details about the original page quality, since QUALITY attribute gives only brief and restrictive information + + + + + + Position of the page. Could be lefthanded, righthanded, cover, foldout or single if it has no special position. 
+ + + + + + + + + + + + Page Confidence: Confidence level of the ocr for this page. A value between 0 (unsure) and 1 (sure). + + + + + + + + + One page of a book or journal. + + + + + The area between the top line of print and the upper edge of the leaf. It may contain page number or running title. + + + + + The area between the printspace and the left border of a page. May contain margin notes. + + + + + The area between the printspace and the right border of a page. May contain margin notes. + + + + + The area between the bottom line of letterpress or writing and the bottom edge of the leaf. It may contain a page number, a signature number or a catch word. + + + + + Rectangle covering the printed area of a page. Page number and running title are not part of the print space. + + + + + + + Any user-defined class like title page. + + + + + + + + + The number of the page within the document. + + + + + The page number that is printed on the page. + + + + + + + + A link to the processing description that has been used for this page. + + + + + Estimated percentage of OCR Accuracy in range from 0 to 100 + + + + + + + + + + + + + A text style defines font properties of text. + + + + + + + A paragraph style defines formatting properties of text blocks. + + + + + Indicates the alignment of the paragraph. Could be left, right, center or justify. + + + + + + + + + + + + + Left indent of the paragraph in relation to the column. + + + + + Right indent of the paragraph in relation to the column. + + + + + Line spacing between two lines of the paragraph. Measurement calculated from baseline to baseline. + + + + + Indent of the first line of the paragraph if this is different from the other lines. A negative value indicates an indent to the left, a positive value indicates an indent to the right. + + + + + + + + + + + + + + + + + + + + + + + + + + + Group of available block types + + + + + A block of text. + + + + + A picture or image. + + + + + A graphic used to separate blocks. 
Usually a line or rectangle. + + + + + A block that consists of other blocks + + + + + + + Base type for any kind of block on the page. + + + + + + + + + + + + + + + Tells the rotation of e.g. text or illustration within the block. The value is in degree counterclockwise. + + + + + The next block in reading sequence on the page. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + + + A white space. + + + + + + + + + + Type of the substitution (if any). + + + + + + + + + + + + + + + Word Confidence: Confidence level of the ocr for this string. A value between 0 (unsure) and 1 (sure). + + + + + + + + + + Any alternative for the word. + Alternative can outline a variant of writing by new typing / spelling rules, typically manually done or by dictionary replacements. + The above sample is an old composed character "Æ" of ancient time, which is replaced now by "Ä". + As variant are meant alternatives of the real printed content which are options outlined by the text recognition process. + Similar sample: "Straße" vs. "Strasse". Such alternatives are not coming from text recognition. + + + + + + + Identifies the purpose of the alternative. + + + + + + + + A sequence of chars. Strings are separated by white spaces or hyphenation chars. + + + + + + + + + + + + + + + + + + + + Content of the substitution. + + + + + + Confidence level of each character in that string. A list of numbers, one number between 0 (sure) and 9 (unsure) for each character. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + Attribute to record language of the string. The language should be recorded at the highest level possible. 
+ + + + + + A region on a page + + + + + + + + + + + + + + + + + + A list of points + + + + + + Describes the bounding shape of a block, if it is not rectangular. + + + + + + + + + + A polygon shape. + + + + + + An ellipse shape. HPOS and VPOS describe the center of the ellipse. + HLENGTH and VLENGTH are the width and height of the described ellipse. + The attribute ROTATION tells the rotation of e.g. text or + illustration within the block. The value is in degrees counterclockwise. + + + + + + + + + + A circle shape. HPOS and VPOS describe the center of the circle. + + + + + + + + Formatting attributes. Note that these attributes are assumed to be inherited from ancestor elements of the document hierarchy. + + + + The font name. + + + + + + + The font size, in points (1/72 of an inch). + + + + + Font color as RGB value + + + + + + + Serif or Sans-Serif + + + + + + + + + fixed or proportional + + + + + + + + + + + All measurement values inside the alto file are related to + this unit, except the font size. + Coordinates as being used in HPOS and VPOS are absolute coordinates referring to the upper-left corner of a page. + The upper left corner of the page is defined as coordinate (0/0). + + values meaning: + mm10: 1/10th of millimeter + inch1200: 1/1200th of inch + pixel: 1 pixel + + The values for pixel will be related to the resolution of the image based + on which the layout is described. In case the original image is not known + the scaling factor can be calculated based on total width and height of + the image and the according information of the PAGE element. + + + + + + + + + + + Information to identify the image file from which the OCR text was created. + + + + + + + + + + + + + + + + + + + A unique identifier for the image file. This is drawn from MIX. + This identifier must be unique within the local system. 
+ To facilitate file sharing or interoperability with other systems, fileIdentifierLocation may be added to designate the system or application where the identifier is unique. + + + + + + A location qualifier, i.e., a namespace. + + + + + + + + + + + + + + A unique identifier for the document. + This identifier must be unique within the local system. + To facilitate file sharing or interoperability with other systems, documentIdentifierLocation may be added to designate the system or application where the identifier is unique. + + + + + + A location qualifier, i.e., a namespace. + + + + + + + + Deprecated. processingType should be used instead. + Information on how the text was created, including preprocessing, OCR processing, and postprocessing steps. Where possible, this draws from MIX's change history. + + + + + + + + + + Description of the processing step. + + + + + Classification of the category of operation, how the file was created, including generation, modification, preprocessing, postprocessing or any other steps. + + + + + Date or DateTime the image was processed. + + + + + Identifies the organization-level producer(s) of the processed image. + + + + + An ordinal listing of the image processing steps performed. For example, "image despeckling." + + + + + A description of any setting of the processing application. For example, for a multi-engine OCR application this might include the engines which were used. Ideally, this description should be adequate so that someone else using the same application can produce identical results. + + + + + + + + + + + + + + + + + + + + + Information about a software application. Where applicable, the preferred method for determining this information is by selecting Help -- About. + + + + + The name of the organization or company that created the application. + + + + + The name of the application. + + + + + The version of the application. 
+ + + + + A description of any important characteristics of the application, especially for non-commercial applications. For example, if a non-commercial application is built using commercial components, e.g., an OCR engine SDK. Those components should be mentioned here. + + + + + + + + + + List of any combination of font styles + + + + + + + + + + + + + + + + + + + + + + + A block that consists of other blocks + + + + + + + + + A user defined string to identify the type of composed block (e.g. table, advertisement, ...) + + + + + An ID to link to an image which contains only the composed block. The ID and the file link is defined in the related METS file. + + + + + + + + A picture or image. + + + + + + A user defined string to identify the type of illustration like photo, map, drawing, chart, ... + + + + + A link to an image which contains only the illustration. + + + + + + + + A graphic used to separate blocks. Usually a line or rectangle. + + + + + + + + A block of text. + + + + + + + A single line of text. + + + + + + + + + + + + + A hyphenation char. Can appear only at the end of a line. + + + + + + + + + + + + + + + + + + + + + Pixel coordinates based on the left-hand top corner of an image which define a polyline on which a line of text rests. + + + + + Attribute to record language of the textline. + + + + + Correction Status. Indicates whether manual correction has been done or not. The correction status should be recorded at the highest level possible (Block, TextLine, String). + + + + + + + + Attribute deprecated. LANG should be used instead. + + + + + Attribute to record language of the textblock. + + + + + + + + + + + The xml data wrapper element XmlData is used to contain XML encoded metadata. + The content of an XmlData element can be in any namespace or in no namespace. + As permitted by the XML Schema Standard, the processContents attribute value for the + metadata in an XmlData is set to “lax”. 
Therefore, if the source schema and its location are + identified by means of an XML schemaLocation attribute, then an XML processor will validate + the elements for which it can find declarations. If a source schema is not identified, or cannot be + found at the specified schemaLocation, then an XML validator will check for well-formedness, + but otherwise skip over the elements appearing in the XmlData element. + + + + + + + + + + + + + Type can be used to classify and group the information within each tag element type. + + + + + Content / information value of the tag. + + + + + Description text for tag information for clarification. + + + + + Any URI for authority or description relevant information. + + + + + + + Modern OCR software stores information on glyph level. A glyph is essentially a character or ligature. + Accordingly the value for the glyph element will be defined as follows: + Pre-composed representation = base + combining character(s) (decomposed representation) + See http://www.fileformat.info/info/unicode/char/0101/index.htm + "U+0101" = (U+0061) + (U+0304) + "combining characters" ("base characters" in combination with non-spacing marks or characters which are combined to one) are represented as one "glyph", e.g. áàâ. + + Each glyph has its own coordinate information and must be separately addressable as a distinct object. + Correction and verification processes can be carried out for individual characters. + + Post-OCR analysis of the text as well as adaptive OCR algorithm must be able to record information on glyph level. + In order to reproduce the decision of the OCR software, optional characters must be recorded. These are called variants. + The OCR software evaluates each variant and picks the one with the highest confidence score as the glyph. + The confidence score expresses how confident the OCR software is that a single glyph had been recognized correctly. + + The glyph elements are in order of the word. 
Each glyph needs to be recorded to build up the whole word sequence. + + The glyph’s CONTENT attribute is no replacement for the string’s CONTENT attribute. + Due to post-processing steps such as correction the values of both attributes may be inconsistent. + + + + + + + + + + + CONTENT contains the precomposed representation (combining character) of the character from the parent String element. + The sequence position of the Glyph element matches the position of the character in the String. + + + + + + + + + + + + + This GC attribute records a float value between 0.0 and 1.0 that expresses the level of confidence for the variant where 1 is certain. + This attribute is optional. If it is not available, the default value for the variant is “0”. + The GC attribute semantic is the same as the WC attribute on the String element and VC on Variant element. + + + + + + + + + + + + + + + + + + Alternative (combined) character for the glyph, outlined by OCR engine or similar recognition processes. + In case the variant consists of two (combining) characters, two characters are outlined in one Variant element. + E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn". + For details on different use cases, see the samples on GitHub. + + + + + + Each Variant represents an option for the glyph that the OCR software detected as possible alternatives. + In case the variant consists of two (combining) characters, two characters are outlined in one Variant element. + E.g. a Glyph element with CONTENT="m" can have a Variant element with the content "rn". + For details on different use cases, see the samples on GitHub. + + + + + + + + + + + + + This VC attribute records a float value between 0.0 and 1.0 that expresses the level of confidence for the variant where 1 is certain. + This attribute is optional. If it is not available, the default value for the variant is “0”. + The VC attribute semantic is the same as the GC attribute on the Glyph element. 
+ + + + + + + + + + + \ No newline at end of file