/**********************************************************************************************/
/*  This program is part of the Barcelona OpenMP Tasks Suite                                  */
/*  Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion  */
/*  Copyright (C) 2009 Universitat Politecnica de Catalunya                                   */
/*                                                                                            */
/*  This program is free software; you can redistribute it and/or modify                      */
/*  it under the terms of the GNU General Public License as published by                      */
/*  the Free Software Foundation; either version 2 of the License, or                         */
/*  (at your option) any later version.                                                       */
/*                                                                                            */
/*  This program is distributed in the hope that it will be useful,                           */
/*  but WITHOUT ANY WARRANTY; without even the implied warranty of                            */
/*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                             */
/*  GNU General Public License for more details.                                              */
/*                                                                                            */
/*  You should have received a copy of the GNU General Public License                         */
/*  along with this program; if not, write to the Free Software                               */
/*  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA              */
/**********************************************************************************************/
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <random>

#pragma once

typedef double REAL;
typedef unsigned long PTR;

/* MATRIX_SIZE must be a power of 2 and a multiple of 16 */
#define MATRIX_SIZE 1024

/* Below this cut off strassen uses MultiplyByDivideAndConquer() algorithm */
//#define CUTOFF_SIZE 2*1024
#define CUTOFF_SIZE 64

inline double *MatrixA, *MatrixB, *MatrixC;

/* ******************************************************************* */
/* STRASSEN APPLICATION CUT OFF's                                      */
/* ******************************************************************* */
/* Strassen uses three different functions to compute Matrix Multiply. */
/* Each of them is related to an application cut off value:            */
/*  - Initial algorithm: OptimizedStrassenMultiply()                   */
/*  - bots_app_cutoff_value: MultiplyByDivideAndConquer()              */
/*  - SizeAtWhichNaiveAlgorithmIsMoreEfficient:                        */
/*      FastAdditiveNaiveMatrixMultiply()                              */
/* ******************************************************************* */

/*FIXME: at the moment we use a constant value, change to parameter ???*/
/* Below this cut off strassen uses FastAdditiveNaiveMatrixMultiply algorithm */
#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16

/***********************************************************************
 * maximum tolerable relative error (for the checking routine)
 **********************************************************************/
#define EPSILON (1.0E-6)

/***********************************************************************
 * Matrices are stored in row-major order; A is a pointer to
 * the first element of the matrix, and an is the number of elements
 * between two rows. This macro produces the element A[i,j]
 * given A, an, i and j
 **********************************************************************/
#define ELEM(A, an, i, j) (A[(i)*(an)+(j)])

void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                             unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB);
void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                     unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB);
void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
                                unsigned MatrixSize,
                                unsigned RowWidthC,
                                unsigned RowWidthA,
                                unsigned RowWidthB,
                                int AdditiveMode
                               );
void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                   unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth);
REAL *alloc_matrix(int n);

/************************************************************************
 * Set an n by n matrix A to random values. The distance between
 * rows is an. Values are drawn with rand() in [0, 1]; the caller is
 * expected to seed the generator (see init_ABC) for reproducibility.
 ************************************************************************/
inline void init_matrix(int n, REAL *A, int an) {
    // Row-major fill; rand() is consumed in the same order as before,
    // so a given seed produces exactly the same matrix.
    for (int row = 0; row < n; ++row) {
        REAL *row_start = A + static_cast<long>(row) * an;
        for (int col = 0; col < n; ++col)
            row_start[col] = ((double) rand()) / (double) RAND_MAX;
    }
}
/************************************************************************
 * Compare two matrices. Print an error message and return 0 if any
 * pair of elements differs (relatively) by more than EPSILON; return 1
 * otherwise. A is the reference matrix; an/bn are the row strides.
 ************************************************************************/
inline int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) {
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < n; ++j) {
            const REAL a = A[i * an + j];
            const REAL b = B[i * bn + j];

            /* compute the relative error |a - b| / |a|.
             * BUG FIX: the original divided by the raw (possibly negative)
             * reference value, so for a < 0 the ratio was negative and any
             * mismatch silently passed the EPSILON check. Taking fabs of
             * the denominator restores a correct relative-error test. */
            const REAL diff = std::fabs(a - b);
            if (diff == 0.0)
                continue;           /* exact match; also avoids 0/0 -> NaN */

            const REAL rel = diff / std::fabs(a);
            if (rel > EPSILON) {
                std::cerr << "Strassen: Wrong answer! : " << rel << '/' << EPSILON << std::endl;
                // FAIL
                return 0;
            }
        }
    }

    // SUCCESS
    return 1;
}
/***********************************************************************
 * Allocate a matrix of side n (therefore n^2 elements).
 * Returns an uninitialized buffer; callers release it with free(),
 * so the allocation stays malloc-based.
 **********************************************************************/
inline REAL *alloc_matrix(int n) {
    const unsigned long elements = static_cast<unsigned long>(n) * static_cast<unsigned long>(n);
    return static_cast<REAL*>(malloc(elements * sizeof(REAL)));
}
/***********************************************************************
 * Naive sequential algorithm, for comparison purposes.
 * C = A x B for n x n matrices with row strides an, bn, cn.
 **********************************************************************/
inline void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn) {
    // Classic triple loop. The inner accumulation runs over k in the
    // same order as before, so results match the original bit-for-bit.
    for (int row = 0; row < n; ++row) {
        for (int col = 0; col < n; ++col) {
            REAL acc = 0.0;
            for (int k = 0; k < n; ++k)
                acc += A[row * an + k] * B[k * bn + col];
            C[row * cn + col] = acc;
        }
    }
}
/***********************************************************************
 * Naive sequential strassen algorithm, for comparison purposes.
 *
 * Computes C = A x B recursively with seven QuadrantSize
 * sub-multiplications (Strassen-style; the S1..S8 sums below match the
 * Winograd variant's operand schedule — confirm against the _par twin).
 * Below CUTOFF_SIZE it falls back to MultiplyByDivideAndConquer().
 * RowWidth* are the element strides between consecutive rows; the
 * << 3 shifts convert element counts to bytes and assume
 * sizeof(REAL) == 8 (see the explicit note in FastNaiveMatrixMultiply).
 **********************************************************************/
inline void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                          unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth)
{
  unsigned QuadrantSize = MatrixSize >> 1; /* MatrixSize / 2 */
  /* +32 bytes of slack per temporary so each one can be realigned to a
   * 32-byte cache boundary (see the alignment fixup after malloc). */
  unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize
                                 + 32;
  unsigned Column, Row;

  /************************************************************************
  ** For each matrix A, B, and C, we'll want pointers to each quadrant
  ** in the matrix. These quadrants will be addressed as follows:
  **  --        --
  **  | A11  A12 |
  **  |          |
  **  | A21  A22 |
  **  --        --
  ************************************************************************/
  REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12,
       *A21, *B21, *C21, *A22, *B22, *C22;

  /* S1..S8: Strassen sum/difference temporaries; M2, M5, T1sMULT hold
   * sub-products that feed the final combining pass below. */
  REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT;
  /* T2 is accumulated directly in the C22 quadrant to save a buffer.
   * NOTE(review): T2sMULT itself is not referenced in this function —
   * presumably kept for symmetry with the _par version; confirm. */
  #define T2sMULT C22
  #define NumberOfVariables 11

  /* Byte offsets walked through the temporaries (E) and through the
   * input quadrants (EA/EB) in the first element-wise pass. */
  PTR TempMatrixOffset = 0;
  PTR MatrixOffsetA = 0;
  PTR MatrixOffsetB = 0;

  char *Heap;
  void *StartHeap;

  /* Distance (in bytes) between the end of a matrix row and the start
   * of the next row; << 3 == * sizeof(REAL) for 8-byte REALs. */
  PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3;
  PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3;
  PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3;

  /* Recursion base case: hand small problems to the divide-and-conquer
   * multiplier before any temporaries are allocated. */
  if (MatrixSize <= CUTOFF_SIZE) {
    MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0);
    return;
  }

  ///* Initialize quadrant matrices */
  //#define A11 A
  //#define B11 B
  //#define C11 C

  /* Top-left quadrants alias the matrix origins (locals replace the old
   * A11/B11/C11 macros). */
  auto A11 = A;
  auto B11 = B;
  auto C11 = C;

  A12 = A11 + QuadrantSize;
  B12 = B11 + QuadrantSize;
  C12 = C11 + QuadrantSize;
  A21 = A + (RowWidthA * QuadrantSize);
  B21 = B + (RowWidthB * QuadrantSize);
  C21 = C + (RowWidthC * QuadrantSize);
  A22 = A21 + QuadrantSize;
  B22 = B21 + QuadrantSize;
  C22 = C21 + QuadrantSize;

  /* Allocate Heap Space Here: one arena carved into the 11 temporaries.
   * Freed via StartHeap at the end of this call. */
  StartHeap = Heap = static_cast<char*>(malloc(QuadrantSizeInBytes * NumberOfVariables));
  /* ensure that heap is on cache boundary (32-byte aligned) */
  if ( ((PTR) Heap) & 31)
     Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) );

  /* Distribute the heap space over the variables */
  S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes;
  T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes;

  /***************************************************************************
  ** First pass: build the S1..S8 sum/difference quadrants element-wise.
  ** Step through all columns row by row (vertically)
  ** (jumps in memory by RowWidth => bad locality)
  ** (but we want the best locality on the innermost loop)
  ***************************************************************************/
  for (Row = 0; Row < QuadrantSize; Row++) {

    /*************************************************************************
    ** Step through each row horizontally (addressing elements in each column)
    ** (jumps linearly through memory => good locality)
    *************************************************************************/
    for (Column = 0; Column < QuadrantSize; Column++) {

      /***********************************************************
      ** Within this loop, the following holds for MatrixOffset:
      ** MatrixOffset = (Row * RowWidth) + Column
      ** (note: that the unit of the offset is bytes, not reals)
      ***********************************************************/
      /* Element of Global Matrix, such as A, B, C */
      #define E(Matrix)   (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) )
      #define EA(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) )
      #define EB(Matrix)  (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) )

      /* FIXME - may pay to expand these out - got higher speed-ups below */
      /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */
      E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) );

      /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */
      E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21);

      /* S3 = A11 - A21 */
      E(S3) = EA(A11) - EA(A21);

      /* S7 = B22 - B12 */
      E(S7) = EB(B22) - EB(B12);

      TempMatrixOffset += sizeof(REAL);
      MatrixOffsetA += sizeof(REAL);
      MatrixOffsetB += sizeof(REAL);
    } /* end column loop */

    /* Skip the gap between the quadrant's row end and the next row. */
    MatrixOffsetA += RowIncrementA;
    MatrixOffsetB += RowIncrementB;
  } /* end row loop */

  /* Seven recursive sub-multiplications (the hallmark of Strassen). */

  /* M2 = A11 x B11 */
  OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1);

  /* M5 = S1 * S5 */
  OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);

  /* Step 1 of T1 = S2 x S6 + M2 */
  OptimizedStrassenMultiply_seq(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1);

  /* Step 1 of T2 = T1 + S3 x S7 */
  OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1);

  /* Step 1 of C11 = M2 + A12 * B21 */
  OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1);

  /* Step 1 of C12 = S4 x B22 + T1 + M5 */
  OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1);

  /* Step 1 of C21 = T2 - A22 * S8 */
  OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1);

  /***************************************************************************
  ** Combining pass: fold M2, M5 and T1 into the four C quadrants,
  ** manually unrolled 4-wide (QuadrantSize is a multiple of 4 here).
  ** Step through all columns row by row (vertically)
  ** (jumps in memory by RowWidth => bad locality)
  ** (but we want the best locality on the innermost loop)
  ***************************************************************************/
  for (Row = 0; Row < QuadrantSize; Row++) {
    /*************************************************************************
    ** Step through each row horizontally (addressing elements in each column)
    ** (jumps linearly through memory => good locality)
    *************************************************************************/
    for (Column = 0; Column < QuadrantSize; Column += 4) {
      REAL LocalM5_0 = *(M5);
      REAL LocalM5_1 = *(M5+1);
      REAL LocalM5_2 = *(M5+2);
      REAL LocalM5_3 = *(M5+3);
      REAL LocalM2_0 = *(M2);
      REAL LocalM2_1 = *(M2+1);
      REAL LocalM2_2 = *(M2+2);
      REAL LocalM2_3 = *(M2+3);
      /* T1 = S2*S6 + M2 (completes "Step 1 of T1" above) */
      REAL T1_0 = *(T1sMULT) + LocalM2_0;
      REAL T1_1 = *(T1sMULT+1) + LocalM2_1;
      REAL T1_2 = *(T1sMULT+2) + LocalM2_2;
      REAL T1_3 = *(T1sMULT+3) + LocalM2_3;
      /* T2 = S3*S7 (already in C22) + T1 */
      REAL T2_0 = *(C22) + T1_0;
      REAL T2_1 = *(C22+1) + T1_1;
      REAL T2_2 = *(C22+2) + T1_2;
      REAL T2_3 = *(C22+3) + T1_3;
      /* C11 = A12*B21 (already there) + M2 */
      (*(C11))   += LocalM2_0;
      (*(C11+1)) += LocalM2_1;
      (*(C11+2)) += LocalM2_2;
      (*(C11+3)) += LocalM2_3;
      /* C12 = S4*B22 (already there) + T1 + M5 */
      (*(C12))   += LocalM5_0 + T1_0;
      (*(C12+1)) += LocalM5_1 + T1_1;
      (*(C12+2)) += LocalM5_2 + T1_2;
      (*(C12+3)) += LocalM5_3 + T1_3;
      /* C22 = M5 + T2 */
      (*(C22))   = LocalM5_0 + T2_0;
      (*(C22+1)) = LocalM5_1 + T2_1;
      (*(C22+2)) = LocalM5_2 + T2_2;
      (*(C22+3)) = LocalM5_3 + T2_3;
      /* C21 = T2 - A22*S8 (A22*S8 already in C21, hence the negation) */
      (*(C21  )) = (- *(C21  )) + T2_0;
      (*(C21+1)) = (- *(C21+1)) + T2_1;
      (*(C21+2)) = (- *(C21+2)) + T2_2;
      (*(C21+3)) = (- *(C21+3)) + T2_3;
      M5 += 4;
      M2 += 4;
      T1sMULT += 4;
      C11 += 4;
      C12 += 4;
      C21 += 4;
      C22 += 4;
    }
    /* Advance each C quadrant pointer past the inter-row gap (bytes). */
    C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC);
    C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC);
    C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC);
    C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC);
  }
  /* Release the temporaries arena (StartHeap, not the realigned Heap). */
  free(StartHeap);
}
/*****************************************************************************
**
** FastNaiveMatrixMultiply
**
** For small to medium sized matrices A, B, and C of size
** MatrixSize * MatrixSize this function performs the operation
** C = A x B efficiently.
**
** Note MatrixSize must be divisible by 8.
**
** INPUT:
**    C = (*C WRITE) Address of top left element of matrix C.
**    A = (*A IS READ ONLY) Address of top left element of matrix A.
**    B = (*B IS READ ONLY) Address of top left element of matrix B.
**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
**
** OUTPUT:
**    C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.)
**
*****************************************************************************/
inline void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                    unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) {
    // Straight index-based formulation of the original pointer-walking code.
    // Each output element is seeded with the k == 0 product and then
    // accumulates k = 1..MatrixSize-1 in order, matching the original's
    // floating-point summation order exactly.
    for (unsigned row = 0; row < MatrixSize; ++row) {
        REAL *a_row = A + static_cast<unsigned long>(row) * RowWidthA;
        REAL *c_row = C + static_cast<unsigned long>(row) * RowWidthC;
        for (unsigned col = 0; col < MatrixSize; ++col) {
            REAL acc = a_row[0] * B[col];                    /* k == 0 term */
            for (unsigned k = 1; k < MatrixSize; ++k)
                acc += a_row[k] * B[static_cast<unsigned long>(k) * RowWidthB + col];
            c_row[col] = acc;
        }
    }
}
/*****************************************************************************
**
** FastAdditiveNaiveMatrixMultiply
**
** For small to medium sized matrices A, B, and C of size
** MatrixSize * MatrixSize this function performs the operation
** C += A x B efficiently.
**
** Note MatrixSize must be divisible by 8.
**
** INPUT:
**    C = (*C READ/WRITE) Address of top left element of matrix C.
**    A = (*A IS READ ONLY) Address of top left element of matrix A.
**    B = (*B IS READ ONLY) Address of top left element of matrix B.
**    MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
**    RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
**    RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
**    RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
**
** OUTPUT:
**    C = (*C READ/WRITE) Matrix C contains C + A x B.
**
*****************************************************************************/
inline void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize,
                                            unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) {
    // Index-based formulation of the original pointer-walking code.
    // The accumulator is seeded with the existing C value and then adds
    // k = 0..MatrixSize-1 products in order, matching the original's
    // floating-point summation order exactly.
    for (unsigned row = 0; row < MatrixSize; ++row) {
        REAL *a_row = A + static_cast<unsigned long>(row) * RowWidthA;
        REAL *c_row = C + static_cast<unsigned long>(row) * RowWidthC;
        for (unsigned col = 0; col < MatrixSize; ++col) {
            REAL acc = c_row[col];                          /* C += ... */
            for (unsigned k = 0; k < MatrixSize; ++k)
                acc += a_row[k] * B[static_cast<unsigned long>(k) * RowWidthB + col];
            c_row[col] = acc;
        }
    }
}
/*****************************************************************************
|
|
**
|
|
** MultiplyByDivideAndConquer
|
|
**
|
|
** For medium to medium-large (would you like fries with that) sized
|
|
** matrices A, B, and C of size MatrixSize * MatrixSize this function
|
|
** efficiently performs the operation
|
|
** C = A x B (if AdditiveMode == 0)
|
|
** C += A x B (if AdditiveMode != 0)
|
|
**
|
|
** Note MatrixSize must be divisible by 16.
|
|
**
|
|
** INPUT:
|
|
** C = (*C READ/WRITE) Address of top left element of matrix C.
|
|
** A = (*A IS READ ONLY) Address of top left element of matrix A.
|
|
** B = (*B IS READ ONLY) Address of top left element of matrix B.
|
|
** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n)
|
|
** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1]
|
|
** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1]
|
|
** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1]
|
|
** AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B
|
|
**
|
|
** OUTPUT:
|
|
** C (+)= A x B. (+ if AdditiveMode != 0)
|
|
**
|
|
*****************************************************************************/
|
|
inline void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B,
|
|
unsigned MatrixSize,
|
|
unsigned RowWidthC,
|
|
unsigned RowWidthA,
|
|
unsigned RowWidthB,
|
|
int AdditiveMode
|
|
)
|
|
{
|
|
#define A00 A
|
|
#define B00 B
|
|
#define C00 C
|
|
REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11;
|
|
unsigned QuadrantSize = MatrixSize >> 1;
|
|
|
|
/* partition the matrix */
|
|
A01 = A00 + QuadrantSize;
|
|
A10 = A00 + RowWidthA * QuadrantSize;
|
|
A11 = A10 + QuadrantSize;
|
|
|
|
B01 = B00 + QuadrantSize;
|
|
B10 = B00 + RowWidthB * QuadrantSize;
|
|
B11 = B10 + QuadrantSize;
|
|
|
|
C01 = C00 + QuadrantSize;
|
|
C10 = C00 + RowWidthC * QuadrantSize;
|
|
C11 = C10 + QuadrantSize;
|
|
|
|
if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) {
|
|
|
|
MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB,
|
|
AdditiveMode);
|
|
|
|
MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB,
|
|
AdditiveMode);
|
|
|
|
MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB,
|
|
AdditiveMode);
|
|
|
|
MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB,
|
|
AdditiveMode);
|
|
|
|
MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB,
|
|
1);
|
|
|
|
MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB,
|
|
1);
|
|
|
|
MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB,
|
|
1);
|
|
|
|
MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB,
|
|
1);
|
|
|
|
} else {
|
|
|
|
if (AdditiveMode) {
|
|
FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
} else {
|
|
|
|
FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
}
|
|
|
|
FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
|
|
FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize,
|
|
RowWidthC, RowWidthA, RowWidthB);
|
|
}
|
|
}
/* Write the first `sz` values of C to file `fname`, space-separated.
 * The parameter is now `const std::string&` instead of the original
 * `std::string&&`, which needlessly rejected lvalue arguments; rvalue
 * callers are unaffected. Failure to open the file is now reported
 * instead of being silently ignored. */
inline void dump_matrix(double *C, const unsigned sz, const std::string& fname) {
    std::ofstream ofs {fname};
    if (!ofs) {
        std::cerr << "dump_matrix: cannot open " << fname << '\n';
        return;
    }

    for (unsigned i = 0; i < sz; i++) {
        ofs << C[i] << ' ';
    }
}
/* Allocate and initialize the three global MATRIX_SIZE x MATRIX_SIZE
 * operands used by the benchmark drivers. */
inline void init_ABC() {
    // NOTE(review): alloc_matrix returns malloc'ed memory and is not
    // null-checked here — presumably acceptable for a benchmark; confirm.
    MatrixA = alloc_matrix(MATRIX_SIZE);
    MatrixB = alloc_matrix(MATRIX_SIZE);
    MatrixC = alloc_matrix(MATRIX_SIZE);

    // Fixed seed so every run (and every runtime variant being timed)
    // multiplies identical operands, keeping results comparable.
    ::srand(1);
    init_matrix(MATRIX_SIZE, MatrixA, MATRIX_SIZE);
    init_matrix(MATRIX_SIZE, MatrixB, MATRIX_SIZE);
    // MatrixC is allocated but intentionally left uninitialized — the
    // non-additive multiply kernels overwrite every element.
}

std::chrono::microseconds measure_time_omp(unsigned, REAL *, REAL *, REAL *, int);
std::chrono::microseconds measure_time_tbb(unsigned, REAL *, REAL *, REAL *, int);
std::chrono::microseconds measure_time_taskflow(unsigned, REAL *, REAL *, REAL *, int);