% Journal article by Smetanin and Ulyanov (2014), issue 3 (29), pages 40--48.
% The citation key is kept unchanged so existing \cite commands still resolve.
% NOTE(review): the journal name was empty in the exported record; it is
%   inferred here from the bijournal.hse.ru URL -- confirm against the issue.
% NOTE(review): author affiliations and contact details were originally
%   prepended to the abstract; they now live in the annote field only.
@article{26583204_136992657_2014,
  author   = {Smetanin, Yuri and Ulyanov, Mikhail},
  title    = {Measure of symbolical diversity: Combinatorics on words as an approach to identify generalized characteristics of time series},
  journal  = {Business Informatics},
  year     = {2014},
  number   = {3 (29)},
  pages    = {40--48},
  url      = {https://bijournal.hse.ru/en/2014--3%20(29)/136992657.html},
  keywords = {generalized characteristics, words entropy assessment, measure of symbolic diversity, time series, symbolic descriptions},
  abstract = {Currently various approaches to time series analysis are being
    investigated in terms of their forecasting. In the authors' opinion, an
    approach to cluster analysis, which research object constitutes sets of
    time series generated by various sources, is of particular interest. The
    clusterization space is constructed by using generalized universal
    characteristics of time series each of which is a coordinate in this
    space. In such space for each time series there is a corresponding point
    in the coordinates of universal characteristics. Application of cluster
    analysis methods enables to identify time series that are space metric,
    and for the obtained clusters it is possible to solve the problem of
    choosing an efficient method of forecasting. Construction of a special
    metric space to analyze time series constitutes the research object of
    this article. The research subject is this space coordinates -
    generalized characteristics of time series. In their previous articles,
    the authors have already defined two coordinates of such space: the
    Kolmogorov complexity of the time series and its harmonic complexity.
    This paper focuses on elaboration of a new generalized characteristic of
    time series by using combinatorics on words technique: a measure of
    symbolic diversity. The application of the symbolic coding approach
    enables to represent time series in a space of words in a selected
    alphabet. Investigation of the representation generated by combinatorics
    on words methods enables to estimate the entropy of shifts as a function
    of the length of the sliding window. A measure of symbolic diversity of
    time series has been proposed based on investigation of specifics of the
    first finite difference of this function. The proposed generalized
    characteristic may be applied for further identification of specific
    features of time series; in particular as one of the axes in the
    clusterization space.},
  annote   = {Yuri Smetanin - Chief Researcher, Dorodnitsyn Computing Centre,
    Russian Academy of Sciences. Address: 40, Vavilova street, Moscow,
    119333, Russian Federation. E-mail: smetanin.iury2011@yandex.ru.
    Mikhail Ulyanov - Professor, Department of Applied Mathematics and
    Systems Modeling, Institute of Communications and Media Business, Moscow
    State University of Printing Arts; Professor, Software Management
    Department, School of Software Engineering, Faculty of Computer Science,
    National Research University Higher School of Economics. Address: 20,
    Myasnitskaya street, Moscow, 101000, Russian Federation. E-mail:
    muljanov@mail.ru.},
}