@article{26583204_375002082_2020,
  author   = {Zhukova, Galina and Smetanin, Yuri and Ulyanov, Mikhail},
  title    = {About the possibility of determining the prefix and suffix of a word by subwords of fixed length},
  journal  = {Business Informatics},
  year     = {2020},
  volume   = {14},
  number   = {2},
  pages    = {84--92},
  url      = {https://bijournal.hse.ru/en/2020--2 Vol.14/375002082.html},
  keywords = {word reconstruction, prefix, suffix, multiset of subwords, subwords of fixed length, shift operator},
  abstract = {In applied problems of business informatics related to data analysis (in particular, in the analysis and forecasting of time series, in the study of log files of business processes, etc.), problems of qualitative analysis arise. Qualitative analysis methods often use symbolic coding as a way of presenting information about the processes under study. In a number of situations, due to the fragmentation of such descriptions, the problem arises of reconstructing a complete symbolic description of a process (word) from its successive fragments (subwords). From the multiset of all subwords of a sufficiently large length, the original word is uniquely restored. In the case of insufficiently long subwords, several different reconstructions of the original word are possible. The number of feasible reconstructions can be reduced by determining the suffix and prefix of the reconstructed word. A method is proposed for determining the prefix and suffix of a word consisting of symbols each on the basis of multiset of subwords of a fixed length equal to. We accept the hypothesis that this multiset is generated by a window of a fixed length of one symbol shift in an unknown word. The method for determining the prefix and suffix is based on the construction and analysis of the matrix formed by subwords from  written in rows in arbitrary order and the use of the operator acting on multisets of characters of the alphabet formed by neighboring columns of this matrix.
The method is capable of determining the prefix and suffix, if for any from 1 to. If in the prefix and suffix only for some values of i, the characters in the corresponding positions are determined, and for the remaining characters. In the worst case, the method concludes that for any from 1 to, but does not determine the characters themselves. This is a situation in which the prefix and suffix coincide but cannot be determined.},
  annote   = {In applied problems of business informatics related to data analysis (in particular, in the analysis and forecasting of time series, in the study of log files of business processes, etc.), problems of qualitative analysis arise. Qualitative analysis methods often use symbolic coding as a way of presenting information about the processes under study. In a number of situations, due to the fragmentation of such descriptions, the problem arises of reconstructing a complete symbolic description of a process (word) from its successive fragments (subwords). From the multiset of all subwords of a sufficiently large length, the original word is uniquely restored. In the case of insufficiently long subwords, several different reconstructions of the original word are possible. The number of feasible reconstructions can be reduced by determining the suffix and prefix of the reconstructed word. A method is proposed for determining the prefix and suffix of a word consisting of symbols each on the basis of multiset of subwords of a fixed length equal to. We accept the hypothesis that this multiset is generated by a window of a fixed length of one symbol shift in an unknown word. The method for determining the prefix and suffix is based on the construction and analysis of the matrix formed by subwords from  written in rows in arbitrary order and the use of the operator acting on multisets of characters of the alphabet formed by neighboring columns of this matrix.
The method is capable of determining the prefix and suffix, if for any from 1 to. If in the prefix and suffix only for some values of i, the characters in the corresponding positions are determined, and for the remaining characters. In the worst case, the method concludes that for any from 1 to, but does not determine the characters themselves. This is a situation in which the prefix and suffix coincide but cannot be determined.},
  internal-note = {NOTE(review): journal name was empty in the export and is inferred from the URL host bijournal.hse.ru; confirm against the publisher page. The last keyword was split from the fused string "subwords of fixed lengthshift operator"; confirm. The abstract and annote appear to have lost inline math symbols during export (for example "of a fixed length equal to." and "for any from 1 to."); restore them from the published abstract. TODO: add the doi field once verified.},
}