/* regression.c Simple Mandarin tone recognizer: best-fit regression routines Jim Gilsinan IV gilsinan@post.harvard.edu http://www.fas.harvard.edu/~gilsinan/ Senior Thesis Harvard University 2000-2001 */ #include #include #include #include "gaussian.h" //#define DEBUG // linear regression to find best fit of pitch track // y = a * x + b // start and end are the endpoints (inclusive of start but not end) // of the section of the syllable array we wish to examine. void linear_regression (double * in, int start, int end, double * a, double * b) { int x; // x axis for loop int n = end - start; double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x_sq = 0; // compute b for (x = start; x < end; x++) { sum_y += in[x]; sum_x += x; sum_xy += (in[x] * x); sum_x_sq += x * x; } *b = ((n * sum_xy) - (sum_x * sum_y)) / ((n * sum_x_sq) - (sum_x * sum_x)); // compute a *a = (sum_y - (*b * sum_x)) / n; // output intensive debug routine / sanity check #ifdef DEBUG for ( x = start; x < end; x++) { printf("Actual: %f Predicted: %f\n", in[x], *a + *b * x); } #endif } // linear fit: given a best fit line and a data set, // find the average of the residuals. double linear_fit ( double * in, int start, int end, double a, double b) { double fit = 0; int x; for (x = start; x < end; x++) fit += fabsf((a * x + b) - in[x]); fit /= end - start; return fit; } // polynomial best-fit regression // given the pitch track (in), we find the coefficient // vector of the best-fit regression of specified degree, // between the given endpoints (again inclusive of start // but not end) void polynomial_regression (int deg, double * in, int start, int end, double *a) { int n; // number of data points in section int M = deg + 1; int i, j, k; #ifdef DEBUG double predicted; #endif double ** Eqn; /* This is our key analysis matrix, which we will * generate, and it has the following form (it is * the transpose of matrix A, times matrix A, * with an additional column, Aty, for the Gaussian * elimination routines): * * AtA = [ sum(i=1,n)(xi^0*xi^0),...,sum(i=1,n)(xi^0*xi^M)] * [ ... ... ... ] * [ sum(i=1,n)(xi^M*xi^0),...,sum(i=1,n)(xi^M*xi^M)] * * where M is the degree of the poly + 1. * * from this, we can solve * * (AtA) * a = (At) * y, where a is our solution * coefficient vector, and y is simply a vector * containing values from 0 to n, since our * samples are evenly unit spaced. */ n = end - start; // build Eqn Eqn = malloc(M * sizeof(double *)); for (i = 0; i < M + 1; i++) Eqn[i] = malloc(M * sizeof(double)); // clear the matrix for (i = 0; i < M + 1; i++) for (j = 0; j < M; j++) Eqn[i][j] = 0; for (i = 0; i < M; i++) for (j = 0; j < M; j++) for (k = 0; k < n; k++) Eqn[i][j] += powf(k, j) * powf(k, i); for (i = 0; i < M; i++) for (j = 0; j < n; j++) Eqn[M][i] += powf(j, i) * in[j]; // now we can use Gaussian elimination to solve if (GSolve(Eqn, M, a)) { fprintf(stderr, "Singularity in polynomial regression! Fatal.\n"); exit(1); } // sanity check #ifdef DEBUG for (i = 0; i < n; i++) { predicted = 0; for (j = 0; j < M; j++) predicted += a[j] * pow(i,j); printf ("Predicted: %f. Actual: %f\n", predicted, in[i]); } getchar(); #endif for (i = 1; i < M; i++) free(Eqn[i]); free(Eqn); }