% Journal article record; the citation key is kept exactly as exported.
% NOTE(review): `journal` was empty in the export. "Business Informatics" is
%   inferred from the bijournal.hse.ru host in `url` -- confirm against the source.
% `abstract` holds only the paper's abstract; the author affiliation/contact
%   text that the export had prepended to it now lives in `annote`.
@article{26583204_143009551_2014,
  author   = {Lanin, Mikhail},
  title    = {Automatic detection of reference elements on semi-structured document images},
  journal  = {Business Informatics},
  year     = {2014},
  number   = {4 (30)},
  pages    = {17--23},
  url      = {https://bijournal.hse.ru/en/2014--4%20(30)/143009551.html},
  keywords = {data capture, semi-structured documents, structural description of document, reference elements, reference points, fields, fields layout, machine learning, frequency list},
  abstract = {The paper deals with automatic data extraction from semi-structured documents.
              The through optical character recognition methods are slightly applicable for this kind of input.
              To simplify the process to create structural descriptions of such documents machine learning methods are widely used,
              however, current solutions are still complicated for end-users, because these require manual description of
              document structure elements, which are not directly relevant to data to be extracted.

              The article presents a possible approach to describe variable structure document images used in document data
              capture system called ABBYY FlexiCapture and a method of automatic model creation based on layout of all structure
              elements. The paper provides a detailed description of an algorithm for automatic detection of reference elements
              based on user layout of data to be extracted that enables to facilitate dramatically the process of building of a
              structured model of an ABBYY FlexiCapture document from the user perspective. Integration of this technology at the
              data extraction validation stage enables to incrementally improve the structural model of a document, as it requires
              a user only to correct localization of wrongly found data being extracted. Finally, the paper describes a method to
              assess robustness of the proposed approach and test results.

              The described method involving detection of reference elements has shown its effectiveness in processing actual
              payment documents of a number of German suppliers: 89.3\% of invoices can be treated with no faults with minimum
              user intervention; furthermore, the data had been extracted correctly from 97.8\% of fields.},
  annote   = {Mikhail Lanin - Post-graduate student, Department of Images Recognition and Text Processing,
              Faculty of Innovations and High Technologies, Moscow Institute of Physics and Technology (State University);
              Software engineer, ABBYY Production.
              Address: 9, Institutskiy per., Dolgoprudny, Moscow Region, 141700, Russian Federation.
              E-mail: mike.lanin@gmail.com},
}