pktools 2.6.7
Processing Kernel for geospatial data
pkstatascii.cc
1/**********************************************************************
2pkstatascii.cc: program to calculate basic statistics from text file
3Copyright (C) 2008-2014 Pieter Kempeneers
4
5This file is part of pktools
6
7pktools is free software: you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation, either version 3 of the License, or
10(at your option) any later version.
11
12pktools is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with pktools. If not, see <http://www.gnu.org/licenses/>.
19***********************************************************************/
20#include <iostream>
21#include <fstream>
22#include <vector>
23#include <math.h>
24#include "base/Optionpk.h"
25#include "fileclasses/FileReaderAscii.h"
26#include "algorithms/StatFactory.h"
27/******************************************************************************/
90using namespace std;
91
92int main(int argc, char *argv[])
93{
94 Optionpk<string> input_opt("i","input","name of the input text file");
95 Optionpk<char> fs_opt("fs","fs","field separator.",' ');
96 Optionpk<char> comment_opt("comment","comment","comment character",'#');
97 Optionpk<bool> output_opt("o","output","output the selected columns",false);
98 Optionpk<bool> transpose_opt("t","transpose","transpose input ascii vector (use in combination with --output)",false);
99 Optionpk<int> col_opt("c", "column", "column nr, starting from 0", 0);
100 Optionpk<int> range_opt("r", "range", "rows to start/end reading. Use -r 1 -r 10 to read first 10 rows where first row is header. Use 0 to read all rows with no header.", 0);
101 Optionpk<bool> size_opt("size","size","sample size",false);
102 Optionpk<unsigned int> rand_opt("rnd", "rnd", "generate random numbers", 0);
103 Optionpk<std::string> randdist_opt("dist", "dist", "distribution for generating random numbers, see http://www.gn/software/gsl/manual/gsl-ref_toc.html#TOC320 (only uniform and Gaussian supported yet)", "gaussian");
104 Optionpk<double> randa_opt("rnda", "rnda", "first parameter for random distribution (mean value in case of Gaussian)", 0);
105 Optionpk<double> randb_opt("rndb", "rndb", "second parameter for random distribution (standard deviation in case of Gaussian)", 1);
106 Optionpk<bool> mean_opt("mean","mean","calculate median",false);
107 Optionpk<bool> median_opt("median","median","calculate median",false);
108 Optionpk<bool> var_opt("var","var","calculate variance",false);
109 Optionpk<bool> skewness_opt("skew","skewness","calculate skewness",false);
110 Optionpk<bool> kurtosis_opt("kurt","kurtosis","calculate kurtosis",false);
111 Optionpk<bool> stdev_opt("stdev","stdev","calculate standard deviation",false);
112 Optionpk<bool> sum_opt("sum","sum","calculate sum of column",false);
113 Optionpk<bool> minmax_opt("mm","minmax","calculate minimum and maximum value",false);
114 Optionpk<bool> min_opt("min","min","calculate minimum value",false);
115 Optionpk<bool> max_opt("max","max","calculate maximum value",false);
116 Optionpk<double> src_min_opt("src_min","src_min","start reading source from this minimum value");
117 Optionpk<double> src_max_opt("src_max","src_max","stop reading source from this maximum value");
118 Optionpk<bool> histogram_opt("hist","hist","calculate histogram",false);
119 Optionpk<bool> histogram2d_opt("hist2d","hist2d","calculate 2-dimensional histogram based on two columns",false);
120 Optionpk<short> nbin_opt("nbin","nbin","number of bins to calculate histogram");
121 Optionpk<bool> relative_opt("rel","relative","use percentiles for histogram to calculate histogram",false);
122 Optionpk<bool> kde_opt("kde","kde","Use Kernel density estimation when producing histogram. The standard deviation is estimated based on Silverman's rule of thumb",false);
123 Optionpk<bool> correlation_opt("cor","correlation","calculate Pearson produc-moment correlation coefficient between two columns (defined by -c <col1> -c <col2>",false);
124 Optionpk<bool> rmse_opt("rmse","rmse","calculate root mean square error between two columns (defined by -c <col1> -c <col2>",false);
125 Optionpk<bool> reg_opt("reg","regression","calculate linear regression between two columns and get correlation coefficient (defined by -c <col1> -c <col2>",false);
126 Optionpk<bool> regerr_opt("regerr","regerr","calculate linear regression between two columns and get root mean square error (defined by -c <col1> -c <col2>",false);
127 Optionpk<short> verbose_opt("v", "verbose", "verbose mode when positive", 0,2);
128
129 src_min_opt.setHide(1);
130 src_max_opt.setHide(1);
131 fs_opt.setHide(1);
132 range_opt.setHide(1);
133 output_opt.setHide(1);
134 transpose_opt.setHide(1);
135 comment_opt.setHide(1);
136
137 bool doProcess;//stop process when program was invoked with help option (-h --help)
138 try{
139 //mandatory options
140 doProcess=input_opt.retrieveOption(argc,argv);
141 col_opt.retrieveOption(argc,argv);
142 //optional options
143 size_opt.retrieveOption(argc,argv);
144 rand_opt.retrieveOption(argc,argv);
145 randdist_opt.retrieveOption(argc,argv);
146 randa_opt.retrieveOption(argc,argv);
147 randb_opt.retrieveOption(argc,argv);
148 mean_opt.retrieveOption(argc,argv);
149 median_opt.retrieveOption(argc,argv);
150 var_opt.retrieveOption(argc,argv);
151 stdev_opt.retrieveOption(argc,argv);
152 skewness_opt.retrieveOption(argc,argv);
153 kurtosis_opt.retrieveOption(argc,argv);
154 sum_opt.retrieveOption(argc,argv);
155 minmax_opt.retrieveOption(argc,argv);
156 min_opt.retrieveOption(argc,argv);
157 max_opt.retrieveOption(argc,argv);
158 histogram_opt.retrieveOption(argc,argv);
159 nbin_opt.retrieveOption(argc,argv);
160 relative_opt.retrieveOption(argc,argv);
161 kde_opt.retrieveOption(argc,argv);
162 histogram2d_opt.retrieveOption(argc,argv);
163 correlation_opt.retrieveOption(argc,argv);
164 rmse_opt.retrieveOption(argc,argv);
165 reg_opt.retrieveOption(argc,argv);
166 regerr_opt.retrieveOption(argc,argv);
167 //advanced options
168 src_min_opt.retrieveOption(argc,argv);
169 src_max_opt.retrieveOption(argc,argv);
170 fs_opt.retrieveOption(argc,argv);
171 range_opt.retrieveOption(argc,argv);
172 output_opt.retrieveOption(argc,argv);
173 transpose_opt.retrieveOption(argc,argv);
174 comment_opt.retrieveOption(argc,argv);
175 verbose_opt.retrieveOption(argc,argv);
176 }
177 catch(string predefinedString){
178 std::cout << predefinedString << std::endl;
179 exit(0);
180 }
181 if(!doProcess){
182 cout << endl;
183 cout << "Usage: pkstatascii -i input [-c column]*" << endl;
184 cout << endl;
185 std::cout << "short option -h shows basic options only, use long option --help to show all options" << std::endl;
186 exit(0);//help was invoked, stop processing
187 }
188
189 if(src_min_opt.size()){
190 while(src_min_opt.size()<col_opt.size())
191 src_min_opt.push_back(src_min_opt[0]);
192 }
193 if(src_max_opt.size()){
194 while(src_max_opt.size()<col_opt.size())
195 src_max_opt.push_back(src_max_opt[0]);
196 }
198 if(rand_opt[0]>0){
199 gsl_rng* r=stat.getRandomGenerator(time(NULL));
200 //todo: init random number generator using time...
201 if(verbose_opt[0])
202 std::cout << "generating " << rand_opt[0] << " random numbers: " << std::endl;
203 for(unsigned int i=0;i<rand_opt[0];++i)
204 std::cout << i << " " << stat.getRandomValue(r,randdist_opt[0],randa_opt[0],randb_opt[0]) << std::endl;
205 }
206 vector< vector<double> > dataVector(col_opt.size());
207 vector< vector<double> > statVector(col_opt.size());
208
209 if(!input_opt.size())
210 exit(0);
211 FileReaderAscii asciiReader(input_opt[0]);
212 asciiReader.setFieldSeparator(fs_opt[0]);
213 asciiReader.setComment(comment_opt[0]);
214 asciiReader.setMinRow(range_opt[0]);
215 if(range_opt.size()>1)
216 asciiReader.setMaxRow(range_opt[1]);
217 asciiReader.readData(dataVector,col_opt);
218 assert(dataVector.size());
219 double minValue=0;
220 double maxValue=0;
221 unsigned int nbin=0;
222 if(nbin_opt.size())
223 nbin=nbin_opt[0];
224 if(histogram_opt[0]){
225 stat.minmax(dataVector[0],dataVector[0].begin(),dataVector[0].end(),minValue,maxValue);
226 if(src_min_opt.size())
227 minValue=src_min_opt[0];
228 if(src_max_opt.size())
229 maxValue=src_max_opt[0];
230 if(nbin<1){
231 std::cerr << "Warning: number of bins not defined, calculating bins from min and max value" << std::endl;
232 nbin=maxValue-minValue+1;
233 }
234 }
235 double minX=0;
236 double minY=0;
237 double maxX=0;
238 double maxY=0;
239 if(histogram2d_opt[0]){
240 assert(col_opt.size()==2);
241 if(nbin<1){
242 std::cerr << "Warning: number of bins not defined, calculating bins from min and max value" << std::endl;
243 stat.minmax(dataVector[0],dataVector[0].begin(),dataVector[0].end(),minX,maxX);
244 stat.minmax(dataVector[1],dataVector[1].begin(),dataVector[1].end(),minY,maxY);
245 if(src_min_opt.size())
246 minX=src_min_opt[0];
247 if(src_min_opt.size()>1)
248 minY=src_min_opt[1];
249 if(src_max_opt.size())
250 maxX=src_max_opt[0];
251 if(src_max_opt.size()>1)
252 maxY=src_max_opt[1];
253 minValue=(minX<minY)? minX:minY;
254 maxValue=(maxX>maxY)? maxX:maxY;
255 if(verbose_opt[0])
256 std::cout << "min and max values: " << minValue << ", " << maxValue << std::endl;
257 nbin=maxValue-minValue+1;
258 }
259 }
260 for(int icol=0;icol<col_opt.size();++icol){
261 if(!dataVector[icol].size()){
262 std::cerr << "Warning: dataVector[" << icol << "] is empty" << std::endl;
263 continue;
264 }
265 if(size_opt[0])
266 cout << "sample size column " << col_opt[icol] << ": " << dataVector[icol].size() << endl;
267 if(mean_opt[0])
268 cout << "mean value column " << col_opt[icol] << ": " << stat.mean(dataVector[icol]) << endl;
269 if(var_opt[0])
270 cout << "variance value column " << col_opt[icol] << ": " << stat.var(dataVector[icol]) << endl;
271 if(stdev_opt[0])
272 cout << "standard deviation column " << col_opt[icol] << ": " << sqrt(stat.var(dataVector[icol])) << endl;
273 if(skewness_opt[0])
274 cout << "skewness value column " << col_opt[icol] << ": " << stat.skewness(dataVector[icol]) << endl;
275 if(kurtosis_opt[0])
276 cout << "kurtosis value column " << col_opt[icol] << ": " << stat.kurtosis(dataVector[icol]) << endl;
277 if(sum_opt[0]){
278 cout << setprecision(2);
279 cout << fixed << "sum column " << col_opt[icol] << ": " << (stat.sum(dataVector[icol])) << endl;
280 }
281 if(median_opt[0])
282 cout << "median value column " << col_opt[icol] << ": " << stat.median(dataVector[icol]) << endl;
283 if(minmax_opt[0]){
284 cout << "min value column " << col_opt[icol] << ": " << stat.mymin(dataVector[icol]) << endl;
285 cout << "max value column " << col_opt[icol] << ": " << stat.mymax(dataVector[icol]) << endl;
286 }
287 if(min_opt[0])
288 cout << "min value column " << col_opt[icol] << ": " << stat.mymin(dataVector[icol]) << endl;
289 if(max_opt[0])
290 cout << "max value column " << col_opt[icol] << ": " << stat.mymax(dataVector[icol]) << endl;
291 if(histogram_opt[0]){
292 //todo: support kernel density function and estimate sigma as in practical estimate of the bandwith in http://en.wikipedia.org/wiki/Kernel_density_estimation
293 double sigma=0;
294 if(kde_opt[0]){//.size()){
295 // if(kde_opt[0]>0)
296 // sigma=kde_opt[0];
297 // else
298 sigma=1.06*sqrt(stat.var(dataVector[icol]))*pow(dataVector[icol].size(),-0.2);
299 }
300 assert(nbin);
301 if(verbose_opt[0]){
302 if(sigma>0)
303 std::cout << "calculating kernel density estimate with sigma " << sigma << " for col " << icol << std::endl;
304 else
305 std::cout << "calculating histogram for col " << icol << std::endl;
306 }
307 //test
308 // cout << "debug0" << endl;
309 // cout << "dataVector.size(): " << dataVector.size() << endl;
310 // cout << "statVector.size(): " << statVector.size() << endl;
311
312 // double theMinValue=0;
313 // double theMaxValue=0;
314
315 // stat.minmax(dataVector[icol],dataVector[icol].begin(),dataVector[icol].end(),theMinValue,theMaxValue);
316 // if(minValue<maxValue&&minValue>theMinValue)
317 // theMinValue=minValue;
318 // if(minValue<maxValue&&maxValue<theMaxValue)
319 // theMaxValue=maxValue;
320
321 // //todo: check...
322 // minValue=theMinValue;
323 // maxValue=theMaxValue;
324
325 // if(maxValue<=minValue){
326 // std::ostringstream s;
327 // s<<"Error: could not calculate distribution (min>=max)";
328 // throw(s.str());
329 // }
330 // assert(nbin);
331 // assert(dataVector[icol].size());
332 // if(statVector[icol].size()!=nbin){
333 // statVector[icol].resize(nbin);
334 // for(int i=0;i<nbin;statVector[icol][i++]=0);
335 // }
336 // typename std::vector<double>::const_iterator it;
337 // for(it=dataVector[icol].begin();it!=dataVector[icol].end();++it){
338 // if(*it<minValue)
339 // continue;
340 // if(*it>maxValue)
341 // continue;
342 // if(stat.isNoData(*it))
343 // continue;
344 // int theBin=0;
345 // if(*it==maxValue)
346 // theBin=nbin-1;
347 // else if(*it>minValue && *it<maxValue)
348 // theBin=static_cast<int>(static_cast<double>((nbin-1)*(*it)-minValue)/(maxValue-minValue));
349 // assert(theBin<statVector[icol].size());
350 // ++statVector[icol][theBin];
351 // // if(*it==maxValue)
352 // // ++statVector[icol][nbin-1];
353 // // else if(*it>=minValue && *it<maxValue)
354 // // ++statVector[icol][static_cast<int>(static_cast<double>((*it)-minValue)/(maxValue-minValue)*nbin)];
355 // }
356
357 // exit(0);
358 //end test
359
360 stat.distribution(dataVector[icol],dataVector[icol].begin(),dataVector[icol].end(),statVector[icol],nbin,minValue,maxValue,sigma);
361 if(verbose_opt[0])
362 std::cout << "min and max values: " << minValue << ", " << maxValue << std::endl;
363 }
364 }
365 if(correlation_opt[0]){
366 assert(dataVector.size()==2);
367 cout << "correlation between columns " << col_opt[0] << " and " << col_opt[1] << ": " << stat.correlation(dataVector[0],dataVector[1]) << endl;
368 }
369 if(rmse_opt[0]){
370 assert(dataVector.size()==2);
371 cout << "root mean square error between columns " << col_opt[0] << " and " << col_opt[1] << ": " << stat.rmse(dataVector[0],dataVector[1]) << endl;
372 }
373 if(reg_opt[0]){
374 assert(dataVector.size()==2);
375 double c0=0;
376 double c1=0;
377 double r2=stat.linear_regression(dataVector[0],dataVector[1],c0,c1);
378 cout << "linear regression between columns: " << col_opt[0] << " and " << col_opt[1] << ": " << c0 << "+" << c1 << " * x " << " with R^2 (square correlation coefficient): " << r2 << endl;
379 }
380 if(regerr_opt[0]){
381 assert(dataVector.size()==2);
382 double c0=0;
383 double c1=0;
384 double err=stat.linear_regression_err(dataVector[0],dataVector[1],c0,c1);
385 if(verbose_opt[0])
386 cout << "linear regression between columns: " << col_opt[0] << " and " << col_opt[1] << ": " << c0 << "+" << c1 << " * x " << " with rmse: " << err << endl;
387 else
388 cout << c0 << " " << c1 << " " << err << endl;
389 }
390 if(histogram_opt[0]){
391 for(int irow=0;irow<statVector.begin()->size();++irow){
392 double binValue=0;
393 if(nbin==maxValue-minValue+1)
394 binValue=minValue+irow;
395 else
396 binValue=minValue+static_cast<double>(maxValue-minValue)*(irow+0.5)/nbin;
397 std::cout << binValue << " ";
398 // std::cout << minValue+static_cast<double>(maxValue-minValue)*(irow+0.5)/nbin << " ";
399 for(int icol=0;icol<col_opt.size();++icol){
400 if(relative_opt[0])
401 std::cout << 100.0*static_cast<double>(statVector[icol][irow])/static_cast<double>(dataVector[icol].size());
402 else
403 std::cout << statVector[icol][irow];
404 if(icol<col_opt.size()-1)
405 cout << " ";
406 }
407 cout << endl;
408 }
409 }
410 if(histogram2d_opt[0]){
411 assert(nbin);
412 assert(dataVector.size()==2);
413 assert(dataVector[0].size()==dataVector[1].size());
414 double sigma=0;
415 //kernel density estimation as in http://en.wikipedia.org/wiki/Kernel_density_estimation
416 if(kde_opt[0]){
417 // if(kde_opt[0]>0)
418 // sigma=kde_opt[0];
419 // else
420 sigma=1.06*sqrt(sqrt(stat.var(dataVector[0]))*sqrt(stat.var(dataVector[1])))*pow(dataVector[0].size(),-0.2);
421 }
422 assert(nbin);
423 if(verbose_opt[0]){
424 if(sigma>0)
425 std::cout << "calculating 2d kernel density estimate with sigma " << sigma << " for cols " << col_opt[0] << " and " << col_opt[1] << std::endl;
426 else
427 std::cout << "calculating 2d histogram for cols " << col_opt[0] << " and " << col_opt[1] << std::endl;
428 std::cout << "nbin: " << nbin << std::endl;
429 }
430 std::vector< std::vector<double> > histVector;
431 stat.distribution2d(dataVector[0],dataVector[1],histVector,nbin,minX,maxX,minY,maxY,sigma);
432 for(int binX=0;binX<nbin;++binX){
433 std::cout << std::endl;
434 for(int binY=0;binY<nbin;++binY){
435 double binValueX=0;
436 if(nbin==maxX-minX+1)
437 binValueX=minX+binX;
438 else
439 binValueX=minX+static_cast<double>(maxX-minX)*(binX+0.5)/nbin;
440 double binValueY=0;
441 if(nbin==maxY-minY+1)
442 binValueY=minY+binY;
443 else
444 binValueY=minY+static_cast<double>(maxY-minY)*(binY+0.5)/nbin;
445 double value=0;
446 value=static_cast<double>(histVector[binX][binY])/dataVector[0].size();
447 std::cout << binValueX << " " << binValueY << " " << value << std::endl;
448 // std::cout << minX+static_cast<double>(maxX-minX)*(binX+0.5)/nbin << " " << minY+static_cast<double>(maxY-minY)*(binY+0.5)/nbin << " " << value << std::endl;
449 }
450 }
451 }
452
453 if(output_opt[0]){
454 if(transpose_opt[0]){
455 for(int icol=0;icol<col_opt.size();++icol){
456 for(int irow=0;irow<dataVector.begin()->size();++irow){
457 cout << dataVector[icol][irow];
458 if(irow<dataVector.begin()->size()-1)
459 cout << " ";
460 }
461 cout << endl;
462 }
463 }
464 else{
465 for(int irow=0;irow<dataVector.begin()->size();++irow){
466 for(int icol=0;icol<col_opt.size();++icol){
467 cout << dataVector[icol][irow];
468 if(icol<col_opt.size()-1)
469 cout << " ";
470 }
471 cout << endl;
472 }
473 }
474 }
475}