/**********************************************************************************************/ /* This program is part of the Barcelona OpenMP Tasks Suite */ /* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ /* Copyright (C) 2009 Universitat Politecnica de Catalunya */ /* */ /* This program is free software; you can redistribute it and/or modify */ /* it under the terms of the GNU General Public License as published by */ /* the Free Software Foundation; either version 2 of the License, or */ /* (at your option) any later version. */ /* */ /* This program is distributed in the hope that it will be useful, */ /* but WITHOUT ANY WARRANTY; without even the implied warranty of */ /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ /* GNU General Public License for more details. */ /* */ /* You should have received a copy of the GNU General Public License */ /* along with this program; if not, write to the Free Software */ /* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ /**********************************************************************************************/ #include #include #include #include #include #pragma once typedef double REAL; typedef unsigned long PTR; /* MATRIX_SIZE must be a power of 2 and a multiple of 16 */ #define MATRIX_SIZE 1024 /* Below this cut off strassen uses MultiplyByDivideAndConquer() algorithm */ //#define CUTOFF_SIZE 2*1024 #define CUTOFF_SIZE 64 inline double *MatrixA, *MatrixB, *MatrixC; /* ******************************************************************* */ /* STRASSEN APPLICATION CUT OFF's */ /* ******************************************************************* */ /* Strassen uses three different functions to compute Matrix Multiply. */ /* Each of them is related to an application cut off value: */ /* - Initial algorithm: OptimizedStrassenMultiply() */ /* - bots_app_cutoff_value: MultiplyByDivideAndConquer() */ /* - SizeAtWhichNaiveAlgorithmIsMoreEfficient: FastAdditiveNaiveMatrixMultiply() */ /* ******************************************************************* */ /*FIXME: at the moment we use a constant value, change to parameter ???*/ /* Below this cut off strassen uses FastAdditiveNaiveMatrixMultiply algorithm */ #define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16 /*********************************************************************** * maximum tolerable relative error (for the checking routine) **********************************************************************/ #define EPSILON (1.0E-6) /*********************************************************************** * Matrices are stored in row-major order; A is a pointer to * the first element of the matrix, and an is the number of elements * between two rows. This macro produces the element A[i,j] * given A, an, i and j **********************************************************************/ #define ELEM(A, an, i, j) (A[(i)*(an)+(j)]) void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int AdditiveMode ); void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); REAL *alloc_matrix(int n); /************************************************************************ * Set an n by n matrix A to random values. The distance between * rows is an ************************************************************************/ inline void init_matrix(int n, REAL *A, int an) { int i, j; for (i = 0; i < n; ++i) for (j = 0; j < n; ++j) ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX; } /************************************************************************ * Compare two matrices. Print an error message if they differ by * more than EPSILON. ************************************************************************/ inline int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) { int i, j; REAL c; for (i = 0; i < n; ++i) for (j = 0; j < n; ++j) { /* compute the relative error c */ c = ELEM(A, an, i, j) - ELEM(B, bn, i, j); if (c < 0.0) c = -c; c = c / ELEM(A, an, i, j); if (c > EPSILON) { std::cerr << "Strassen: Wrong answer! : " << c << '/' << EPSILON << std::endl; // FAIL return 0; } } // SUCCESS return 1; } /*********************************************************************** * Allocate a matrix of side n (therefore n^2 elements) **********************************************************************/ inline REAL *alloc_matrix(int n) { return static_cast(malloc(n * n * sizeof(REAL))); } /*********************************************************************** * Naive sequential algorithm, for comparison purposes **********************************************************************/ inline void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn) { int i, j, k; REAL s; for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { s = 0.0; for (k = 0; k < n; ++k) s += ELEM(A, an, i, k) * ELEM(B, bn, k, j); ELEM(C, cn, i, j) = s; } } } /*********************************************************************** * Naive sequential strassen algorithm, for comparison purposes **********************************************************************/ inline void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) { unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + 32; unsigned Column, Row; /************************************************************************ ** For each matrix A, B, and C, we'll want pointers to each quandrant ** in the matrix. These quandrants will be addressed as follows: ** -- -- ** | A11 A12 | ** | | ** | A21 A22 | ** -- -- ************************************************************************/ REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, *A21, *B21, *C21, *A22, *B22, *C22; REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; #define T2sMULT C22 #define NumberOfVariables 11 PTR TempMatrixOffset = 0; PTR MatrixOffsetA = 0; PTR MatrixOffsetB = 0; char *Heap; void *StartHeap; /* Distance between the end of a matrix row and the start of the next row */ PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; if (MatrixSize <= CUTOFF_SIZE) { MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); return; } ///* Initialize quandrant matrices */ //#define A11 A //#define B11 B //#define C11 C auto A11 = A; auto B11 = B; auto C11 = C; A12 = A11 + QuadrantSize; B12 = B11 + QuadrantSize; C12 = C11 + QuadrantSize; A21 = A + (RowWidthA * QuadrantSize); B21 = B + (RowWidthB * QuadrantSize); C21 = C + (RowWidthC * QuadrantSize); A22 = A21 + QuadrantSize; B22 = B21 + QuadrantSize; C22 = C21 + QuadrantSize; /* Allocate Heap Space Here */ StartHeap = Heap = static_cast(malloc(QuadrantSizeInBytes * NumberOfVariables)); /* ensure that heap is on cache boundary */ if ( ((PTR) Heap) & 31) Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); /* Distribute the heap space over the variables */ S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; /*************************************************************************** ** Step through all columns row by row (vertically) ** (jumps in memory by RowWidth => bad locality) ** (but we want the best locality on the innermost loop) ***************************************************************************/ for (Row = 0; Row < QuadrantSize; Row++) { /************************************************************************* ** Step through each row horizontally (addressing elements in each column) ** (jumps linearly througn memory => good locality) *************************************************************************/ for (Column = 0; Column < QuadrantSize; Column++) { /*********************************************************** ** Within this loop, the following holds for MatrixOffset: ** MatrixOffset = (Row * RowWidth) + Column ** (note: that the unit of the offset is number of reals) ***********************************************************/ /* Element of Global Matrix, such as A, B, C */ #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) /* FIXME - may pay to expand these out - got higher speed-ups below */ /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); /* S3 = A11 - A21 */ E(S3) = EA(A11) - EA(A21); /* S7 = B22 - B12 */ E(S7) = EB(B22) - EB(B12); TempMatrixOffset += sizeof(REAL); MatrixOffsetA += sizeof(REAL); MatrixOffsetB += sizeof(REAL); } /* end row loop*/ MatrixOffsetA += RowIncrementA; MatrixOffsetB += RowIncrementB; } /* end column loop */ /* M2 = A11 x B11 */ OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); /* M5 = S1 * S5 */ OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); /* Step 1 of T1 = S2 x S6 + M2 */ OptimizedStrassenMultiply_seq(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); /* Step 1 of T2 = T1 + S3 x S7 */ OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); /* Step 1 of C11 = M2 + A12 * B21 */ OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); /* Step 1 of C12 = S4 x B22 + T1 + M5 */ OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); /* Step 1 of C21 = T2 - A22 * S8 */ OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); /*************************************************************************** ** Step through all columns row by row (vertically) ** (jumps in memory by RowWidth => bad locality) ** (but we want the best locality on the innermost loop) ***************************************************************************/ for (Row = 0; Row < QuadrantSize; Row++) { /************************************************************************* ** Step through each row horizontally (addressing elements in each column) ** (jumps linearly througn memory => good locality) *************************************************************************/ for (Column = 0; Column < QuadrantSize; Column += 4) { REAL LocalM5_0 = *(M5); REAL LocalM5_1 = *(M5+1); REAL LocalM5_2 = *(M5+2); REAL LocalM5_3 = *(M5+3); REAL LocalM2_0 = *(M2); REAL LocalM2_1 = *(M2+1); REAL LocalM2_2 = *(M2+2); REAL LocalM2_3 = *(M2+3); REAL T1_0 = *(T1sMULT) + LocalM2_0; REAL T1_1 = *(T1sMULT+1) + LocalM2_1; REAL T1_2 = *(T1sMULT+2) + LocalM2_2; REAL T1_3 = *(T1sMULT+3) + LocalM2_3; REAL T2_0 = *(C22) + T1_0; REAL T2_1 = *(C22+1) + T1_1; REAL T2_2 = *(C22+2) + T1_2; REAL T2_3 = *(C22+3) + T1_3; (*(C11)) += LocalM2_0; (*(C11+1)) += LocalM2_1; (*(C11+2)) += LocalM2_2; (*(C11+3)) += LocalM2_3; (*(C12)) += LocalM5_0 + T1_0; (*(C12+1)) += LocalM5_1 + T1_1; (*(C12+2)) += LocalM5_2 + T1_2; (*(C12+3)) += LocalM5_3 + T1_3; (*(C22)) = LocalM5_0 + T2_0; (*(C22+1)) = LocalM5_1 + T2_1; (*(C22+2)) = LocalM5_2 + T2_2; (*(C22+3)) = LocalM5_3 + T2_3; (*(C21 )) = (- *(C21 )) + T2_0; (*(C21+1)) = (- *(C21+1)) + T2_1; (*(C21+2)) = (- *(C21+2)) + T2_2; (*(C21+3)) = (- *(C21+3)) + T2_3; M5 += 4; M2 += 4; T1sMULT += 4; C11 += 4; C12 += 4; C21 += 4; C22 += 4; } C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); } free(StartHeap); } /***************************************************************************** ** ** FastNaiveMatrixMultiply ** ** For small to medium sized matrices A, B, and C of size ** MatrixSize * MatrixSize this function performs the operation ** C = A x B efficiently. ** ** Note MatrixSize must be divisible by 8. ** ** INPUT: ** C = (*C WRITE) Address of top left element of matrix C. ** A = (*A IS READ ONLY) Address of top left element of matrix A. ** B = (*B IS READ ONLY) Address of top left element of matrix B. ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] ** ** OUTPUT: ** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) ** *****************************************************************************/ inline void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) { /* Assumes size of real is 8 bytes */ PTR RowWidthBInBytes = RowWidthB << 3; PTR RowWidthAInBytes = RowWidthA << 3; PTR MatrixWidthInBytes = MatrixSize << 3; PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; unsigned Horizontal, Vertical; REAL *ARowStart = A; for (Vertical = 0; Vertical < MatrixSize; Vertical++) { for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { REAL *BColumnStart = B + Horizontal; REAL FirstARowValue = *ARowStart++; REAL Sum0 = FirstARowValue * (*BColumnStart); REAL Sum1 = FirstARowValue * (*(BColumnStart+1)); REAL Sum2 = FirstARowValue * (*(BColumnStart+2)); REAL Sum3 = FirstARowValue * (*(BColumnStart+3)); REAL Sum4 = FirstARowValue * (*(BColumnStart+4)); REAL Sum5 = FirstARowValue * (*(BColumnStart+5)); REAL Sum6 = FirstARowValue * (*(BColumnStart+6)); REAL Sum7 = FirstARowValue * (*(BColumnStart+7)); unsigned Products; for (Products = 1; Products < MatrixSize; Products++) { REAL ARowValue = *ARowStart++; BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); Sum0 += ARowValue * (*BColumnStart); Sum1 += ARowValue * (*(BColumnStart+1)); Sum2 += ARowValue * (*(BColumnStart+2)); Sum3 += ARowValue * (*(BColumnStart+3)); Sum4 += ARowValue * (*(BColumnStart+4)); Sum5 += ARowValue * (*(BColumnStart+5)); Sum6 += ARowValue * (*(BColumnStart+6)); Sum7 += ARowValue * (*(BColumnStart+7)); } ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); *(C) = Sum0; *(C+1) = Sum1; *(C+2) = Sum2; *(C+3) = Sum3; *(C+4) = Sum4; *(C+5) = Sum5; *(C+6) = Sum6; *(C+7) = Sum7; C+=8; } ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); C = (REAL*) ( ((PTR) C) + RowIncrementC ); } } /***************************************************************************** ** ** FastAdditiveNaiveMatrixMultiply ** ** For small to medium sized matrices A, B, and C of size ** MatrixSize * MatrixSize this function performs the operation ** C += A x B efficiently. ** ** Note MatrixSize must be divisible by 8. ** ** INPUT: ** C = (*C READ/WRITE) Address of top left element of matrix C. ** A = (*A IS READ ONLY) Address of top left element of matrix A. ** B = (*B IS READ ONLY) Address of top left element of matrix B. ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] ** ** OUTPUT: ** C = (*C READ/WRITE) Matrix C contains C + A x B. ** *****************************************************************************/ inline void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) { /* Assumes size of real is 8 bytes */ PTR RowWidthBInBytes = RowWidthB << 3; PTR RowWidthAInBytes = RowWidthA << 3; PTR MatrixWidthInBytes = MatrixSize << 3; PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; unsigned Horizontal, Vertical; REAL *ARowStart = A; for (Vertical = 0; Vertical < MatrixSize; Vertical++) { for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { REAL *BColumnStart = B + Horizontal; REAL Sum0 = *C; REAL Sum1 = *(C+1); REAL Sum2 = *(C+2); REAL Sum3 = *(C+3); REAL Sum4 = *(C+4); REAL Sum5 = *(C+5); REAL Sum6 = *(C+6); REAL Sum7 = *(C+7); unsigned Products; for (Products = 0; Products < MatrixSize; Products++) { REAL ARowValue = *ARowStart++; Sum0 += ARowValue * (*BColumnStart); Sum1 += ARowValue * (*(BColumnStart+1)); Sum2 += ARowValue * (*(BColumnStart+2)); Sum3 += ARowValue * (*(BColumnStart+3)); Sum4 += ARowValue * (*(BColumnStart+4)); Sum5 += ARowValue * (*(BColumnStart+5)); Sum6 += ARowValue * (*(BColumnStart+6)); Sum7 += ARowValue * (*(BColumnStart+7)); BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); } ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); *(C) = Sum0; *(C+1) = Sum1; *(C+2) = Sum2; *(C+3) = Sum3; *(C+4) = Sum4; *(C+5) = Sum5; *(C+6) = Sum6; *(C+7) = Sum7; C+=8; } ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); C = (REAL*) ( ((PTR) C) + RowIncrementC ); } } /***************************************************************************** ** ** MultiplyByDivideAndConquer ** ** For medium to medium-large (would you like fries with that) sized ** matrices A, B, and C of size MatrixSize * MatrixSize this function ** efficiently performs the operation ** C = A x B (if AdditiveMode == 0) ** C += A x B (if AdditiveMode != 0) ** ** Note MatrixSize must be divisible by 16. ** ** INPUT: ** C = (*C READ/WRITE) Address of top left element of matrix C. ** A = (*A IS READ ONLY) Address of top left element of matrix A. ** B = (*B IS READ ONLY) Address of top left element of matrix B. ** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) ** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] ** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] ** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] ** AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B ** ** OUTPUT: ** C (+)= A x B. (+ if AdditiveMode != 0) ** *****************************************************************************/ inline void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int AdditiveMode ) { #define A00 A #define B00 B #define C00 C REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11; unsigned QuadrantSize = MatrixSize >> 1; /* partition the matrix */ A01 = A00 + QuadrantSize; A10 = A00 + RowWidthA * QuadrantSize; A11 = A10 + QuadrantSize; B01 = B00 + QuadrantSize; B10 = B00 + RowWidthB * QuadrantSize; B11 = B10 + QuadrantSize; C01 = C00 + QuadrantSize; C10 = C00 + RowWidthC * QuadrantSize; C11 = C10 + QuadrantSize; if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) { MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, AdditiveMode); MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, AdditiveMode); MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, AdditiveMode); MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, AdditiveMode); MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, 1); MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, 1); MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, 1); MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, 1); } else { if (AdditiveMode) { FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); } else { FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); } FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize, RowWidthC, RowWidthA, RowWidthB); } } inline void dump_matrix(double *C, const unsigned sz, std::string&& fname) { std::ofstream ofs {fname}; for(unsigned i=0; i