c++ - How do I more efficiently multiply transpose matrices? -


i keep beating myself on head this. have sse-based algorithm multiplying matrix a matrix b. need implement operations a, b, or both transposed. did naive implementation of it, 4x4 matrix code represented below (which pretty standard sse operations, think), a*b^t operation takes twice long a*b. atlas implementation returns similar values a*b, , identical results multiplying transpose, suggests me there efficient way this.

mm-multiplication:

m1 = (mat1.m_>>2)<<2; n2 = (mat2.n_>>2)<<2; n  = (mat1.n_>>2)<<2;  (k=0; k<n; k+=4) {   (i=0; i<m1; i+=4) {     // fetch: 4x4 matrix mat1     // row-major storage, 4 rows     float* a0 = mat1.el_[i]+k;     float* a1 = mat1.el_[i+1]+k;     float* a2 = mat1.el_[i+2]+k;     float* a3 = mat1.el_[i+3]+k;      (j=0; j<n2; j+=4) {       // fetch: 4x4 matrix mat2       // row-major storage, 4 rows       float* b0 = mat2.el_[k]+j;       float* b1 = mat2.el_[k+1]+j;       float* b2 = mat2.el_[k+2]+j;       float* b3 = mat2.el_[k+3]+j;        __m128 b0r = _mm_loadu_ps(b0);       __m128 b1r = _mm_loadu_ps(b1);       __m128 b2r = _mm_loadu_ps(b2);       __m128 b3r = _mm_loadu_ps(b3);        {  // first row of result += first row of mat1 * 4x4 of mat2         __m128 cx1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a0+0), b0r), _mm_mul_ps(_mm_load_ps1(a0+1), b1r));         __m128 cx2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a0+2), b2r), _mm_mul_ps(_mm_load_ps1(a0+3), b3r));         float* c0 = this->el_[i]+j;         _mm_storeu_ps(c0, _mm_add_ps(_mm_add_ps(cx1, cx2), _mm_loadu_ps(c0)));       }        { // second row of result += second row of mat1 * 4x4 of mat2         __m128 cx1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a1+0), b0r), _mm_mul_ps(_mm_load_ps1(a1+1), b1r));         __m128 cx2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a1+2), b2r), _mm_mul_ps(_mm_load_ps1(a1+3), b3r));         float* c1 = this->el_[i+1]+j;         _mm_storeu_ps(c1, _mm_add_ps(_mm_add_ps(cx1, cx2), _mm_loadu_ps(c1)));       }        { // third row of result += third row of mat1 * 4x4 of mat2         __m128 cx1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a2+0), b0r), _mm_mul_ps(_mm_load_ps1(a2+1), b1r));         __m128 cx2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a2+2), b2r), _mm_mul_ps(_mm_load_ps1(a2+3), b3r));         float* c2 = this->el_[i+2]+j;         _mm_storeu_ps(c2, _mm_add_ps(_mm_add_ps(cx1, cx2), _mm_loadu_ps(c2)));       }        { // fourth row of result += fourth row of mat1 * 4x4 of mat2         __m128 cx1 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a3+0), b0r), _mm_mul_ps(_mm_load_ps1(a3+1), b1r));         __m128 cx2 = _mm_add_ps(_mm_mul_ps(_mm_load_ps1(a3+2), b2r), _mm_mul_ps(_mm_load_ps1(a3+3), b3r));         float* c3 = this->el_[i+3]+j;         _mm_storeu_ps(c3, _mm_add_ps(_mm_add_ps(cx1, cx2), _mm_loadu_ps(c3)));       }   } // code omitted handle remaining rows , columns } 

for mt multiplication (matrix multiplied transpose matrix), stead stored b0r b3r following commands , changed loop variables appropriately:

__m128 b0r = _mm_set_ps(b3[0], b2[0], b1[0], b0[0]); __m128 b1r = _mm_set_ps(b3[1], b2[1], b1[1], b0[1]); __m128 b2r = _mm_set_ps(b3[2], b2[2], b1[2], b0[2]); __m128 b3r = _mm_set_ps(b3[3], b2[3], b1[3], b0[3]); 

i suspect slowdown partly due difference between pulling in row @ time , having store 4 values each time column, feel other way of going this, pulling in rows of b , multiplying column of as, shift cost on storing 4 columns of results.

i have tried pulling in b's rows rows , using _mm_transpose4_ps(b0r, b1r, b2r, b3r); transposition (i thought there might additional optimizations in macro), there's no real improvement.

on surface, feel should faster... dot products involved row row, seems inherently more efficient, trying dot products straight results in having same thing store results.

what missing here?

added: clarify, i'm trying not transpose matrices. i'd prefer iterate along them. problem, best can tell, _mm_set_ps command slower _mm_load_ps.

i tried variation stored 4 rows of matrix , replaced 4 curly-bracketed segments containing 1 load, 4 multiplies, , 2 adds 4 multiply instructions , 3 hadds, little avail. timing stays same (and yes, tried debug statement verify code had changed in test compile. said debug statement removed before profiling, of course):

    {  // first row of result += first row of mat1 * 4x4 of mat2       __m128 cx1 = _mm_hadd_ps(_mm_mul_ps(a0r, b0r), _mm_mul_ps(a0r, b1r));       __m128 cx2 = _mm_hadd_ps(_mm_mul_ps(a0r, b2r), _mm_mul_ps(a0r, b3r));       float* c0 = this->el_[i]+j;       _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cx1, cx2), _mm_loadu_ps(c0)));     }      { // second row of result += second row of mat1 * 4x4 of mat2       __m128 cx1 = _mm_hadd_ps(_mm_mul_ps(a1r, b0r), _mm_mul_ps(a1r, b1r));       __m128 cx2 = _mm_hadd_ps(_mm_mul_ps(a1r, b2r), _mm_mul_ps(a1r, b3r));       float* c0 = this->el_[i+1]+j;       _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cx1, cx2), _mm_loadu_ps(c0)));     }      { // third row of result += third row of mat1 * 4x4 of mat2       __m128 cx1 = _mm_hadd_ps(_mm_mul_ps(a2r, b0r), _mm_mul_ps(a2r, b1r));       __m128 cx2 = _mm_hadd_ps(_mm_mul_ps(a2r, b2r), _mm_mul_ps(a2r, b3r));       float* c0 = this->el_[i+2]+j;       _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cx1, cx2), _mm_loadu_ps(c0)));     }      { // fourth row of result += fourth row of mat1 * 4x4 of mat2       __m128 cx1 = _mm_hadd_ps(_mm_mul_ps(a3r, b0r), _mm_mul_ps(a3r, b1r));       __m128 cx2 = _mm_hadd_ps(_mm_mul_ps(a3r, b2r), _mm_mul_ps(a3r, b3r));       float* c0 = this->el_[i+3]+j;       _mm_storeu_ps(c0, _mm_add_ps(_mm_hadd_ps(cx1, cx2), _mm_loadu_ps(c0)));     } 

update: right, , moving loading of rows of a0r a3r curly braces in attempt avoid register thrashing failed well.

i think 1 few cases horizontal add useful. want c = ab^t b not stored in memory transpose. that's problem. it's store aos instead of soa. in case taking transpose of b , doing vertical add slower using horizontal add think. @ least true matrixvector efficient 4x4 matrix vector multiplication sse: horizontal add , dot product - what's point?. in code below function m4x4 non sse 4x4 matrix-product, m4x4_vec uses sse, m4x4t c=ab^t without sse, , m4x4t_vec c=ab^t usisg sse. last 1 one want think.

note: larger matrices not use method. in case it's faster take transpose first , use vertical add (with sse/avx more complicated, transpose strips sse/avx width). that's because transpose goes o(n^2) , matrix product goes o(n^3) large matrices transpose insignificant. however, 4x4 transpose significant horizontal add wins.

edit: misunderstood wanted. want c = (ab)^t. should fast (ab) , code same swap roles of , b.
can write math follows:

c = a*b in einstein notation c_i,j = a_i,k * b_k,j.   since (a*b)^t = b^t*a^t can write  c = (a*b)^t in einstein notation c_i,j = b^t_i,k * a^t_k,j = a_j,k * b_k,i 

if compare 2 thing changes swap roles of j , i. put code @ end of answer.

#include "stdio.h" #include <nmmintrin.h>      void m4x4(const float *a, const float *b, float *c) {     for(int i=0; i<4; i++) {         for(int j=0; j<4; j++) {             float sum = 0.0f;             for(int k=0; k<4; k++) {                 sum += a[i*4+k]*b[k*4+j];             }             c[i*4 + j] = sum;         }     } }  void m4x4t(const float *a, const float *b, float *c) {     for(int i=0; i<4; i++) {         for(int j=0; j<4; j++) {             float sum = 0.0f;             for(int k=0; k<4; k++) {                 sum += a[i*4+k]*b[j*4+k];             }             c[i*4 + j] = sum;         }     } }  void m4x4_vec(const float *a, const float *b, float *c) {     __m128 brow[4], mrow[4];     for(int i=0; i<4; i++) {         brow[i] = _mm_load_ps(&b[4*i]);     }      for(int i=0; i<4; i++) {         mrow[i] = _mm_set1_ps(0.0f);         for(int j=0; j<4; j++) {             __m128 = _mm_set1_ps(a[4*i +j]);             mrow[i] = _mm_add_ps(mrow[i], _mm_mul_ps(a, brow[j]));         }     }     for(int i=0; i<4; i++) {         _mm_store_ps(&c[4*i], mrow[i]);     } }  void m4x4t_vec(const float *a, const float *b, float *c) {     __m128 arow[4], brow[4], mrow[4];     for(int i=0; i<4; i++) {         arow[i] = _mm_load_ps(&a[4*i]);         brow[i] = _mm_load_ps(&b[4*i]);     }      for(int i=0; i<4; i++) {         __m128 prod[4];         for(int j=0; j<4; j++) {             prod[j] =  _mm_mul_ps(arow[i], brow[j]);         }         mrow[i] = _mm_hadd_ps(_mm_hadd_ps(prod[0], prod[1]), _mm_hadd_ps(prod[2], prod[3]));         }     for(int i=0; i<4; i++) {         _mm_store_ps(&c[4*i], mrow[i]);     }  }  float compare_4x4(const float* a, const float*b) {     float diff = 0.0f;     for(int i=0; i<4; i++) {         for(int j=0; j<4; j++) {             diff += a[i*4 +j] - b[i*4+j];             printf("a %f, b %f\n", a[i*4 +j], b[i*4 +j]);         }     }     return diff;     }  int main() {     float *a = (float*)_mm_malloc(sizeof(float)*16,16);     float *b = (float*)_mm_malloc(sizeof(float)*16,16);     float *c1 = (float*)_mm_malloc(sizeof(float)*16,16);     float *c2 = (float*)_mm_malloc(sizeof(float)*16,16);      for(int i=0; i<4; i++) {         for(int j=0; j<4; j++) {             a[i*4 +j] = i*4+j;             b[i*4 +j] = i*4+j;             c1[i*4 +j] = 0.0f;             c2[i*4 +j] = 0.0f;         }     }     m4x4t(a, b, c1);     m4x4t_vec(a, b, c2);     printf("compare %f\n", compare_4x4(c1,c2));  } 

edit:

here scalar , sse function c = (ab)^t. should fast ab versions.

void m4x4tt(const float *a, const float *b, float *c) {     for(int i=0; i<4; i++) {         for(int j=0; j<4; j++) {             float sum = 0.0f;             for(int k=0; k<4; k++) {                 sum += a[j*4+k]*b[k*4+i];             }             c[i*4 + j] = sum;         }     } }  void m4x4tt_vec(const float *a, const float *b, float *c) {     __m128 arow[4], crow[4];     for(int i=0; i<4; i++) {         arow[i] = _mm_load_ps(&a[4*i]);     }      for(int i=0; i<4; i++) {         crow[i] = _mm_set1_ps(0.0f);         for(int j=0; j<4; j++) {             __m128 = _mm_set1_ps(b[4*i +j]);             crow[i] = _mm_add_ps(crow[i], _mm_mul_ps(a, arow[j]));         }     }      for(int i=0; i<4; i++) {         _mm_store_ps(&c[4*i], crow[i]);     } } 

Comments

Popular posts from this blog

jquery - How can I dynamically add a browser tab? -

keyboard - C++ GetAsyncKeyState alternative -

android - java.net.UnknownHostException(Unable to resolve host “URL”: No address associated with hostname) -