// bMult_AB: block-wise driver over three rank-2 layouts A, B and C.
// A and B (layout + data) are taken by const reference; C_data is taken by
// non-const reference. The block sizes bA1, bA2, bB2 and the flag Add are
// declared outside the visible text (presumably template parameters -- confirm).
// NOTE(review): the statement head that consumes each three-line argument
// group below (A sub-layout, B sub-layout, C sub-layout), and any guards
// around the remainder blocks, are not visible in this text -- confirm
// against the complete source before relying on these comments.
89void bMult_AB(
const A_layout_t &A_layout,
const A_data_t &A_data,
90 const B_layout_t &B_layout,
const B_data_t &B_data,
91 const C_layout_t &C_layout, C_data_t &C_data)
// All three layouts must be rank 2 (checked at compile time).
93 MFEM_STATIC_ASSERT(A_layout_t::rank == 2 && B_layout_t::rank == 2 &&
94 C_layout_t::rank == 2,
"invalid ranks");
// Compile-time extents of the three layouts.
95 const int A1 = A_layout_t::dim_1;
96 const int A2 = A_layout_t::dim_2;
97 const int B1 = B_layout_t::dim_1;
98 const int B2 = B_layout_t::dim_2;
99 const int C1 = C_layout_t::dim_1;
100 const int C2 = C_layout_t::dim_2;
// Shape compatibility: A is A1 x A2, B is A2 x B2, C is A1 x B2.
101 MFEM_STATIC_ASSERT(A2 == B1 && A1 == C1 && B2 == C2,
102 "invalid dimensions");
// Sizes of the trailing partial blocks left over after tiling each extent
// with full blocks of size bA1, bA2 and bB2 respectively.
104 const int rA1 = A1%bA1;
105 const int rA2 = A2%bA2;
106 const int rB2 = B2%bB2;
// ---- Part 1: full-width (bB2) column blocks of B and C ----
// b2_b indexes the full bB2-wide blocks along the second dimension of B/C.
108 for (
int b2_b = 0; b2_b < B2/bB2; b2_b++)
// First block along the shared dimension (offset 0 in A's second / B's first
// dimension), for every full bA1-high row block of A and C.
113 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
116 A_layout.template sub<bA1,bA2>(a1_b*bA1,0), A_data,
117 B_layout.template sub<bA2,bB2>(0,b2_b*bB2), B_data,
118 C_layout.template sub<bA1,bB2>(a1_b*bA1,b2_b*bB2), C_data);
// Same shared-dimension block, for the trailing rA1 rows (offset A1-rA1).
123 A_layout.template sub<rA1,bA2>(A1-rA1,0), A_data,
124 B_layout.template sub<bA2,bB2>(0,b2_b*bB2), B_data,
125 C_layout.template sub<rA1,bB2>(A1-rA1,b2_b*bB2), C_data);
// Remaining full bA2 blocks along the shared dimension; s_b starts at 1
// because block 0 is covered by the argument groups above.
127 for (
int s_b = 1; s_b < A2/bA2; s_b++)
129 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
132 A_layout.template sub<bA1,bA2>(a1_b*bA1,s_b*bA2), A_data,
133 B_layout.template sub<bA2,bB2>(s_b*bA2,b2_b*bB2), B_data,
134 C_layout.template sub<bA1,bB2>(a1_b*bA1,b2_b*bB2), C_data);
// Trailing rA1 rows for shared-dimension block s_b.
139 A_layout.template sub<rA1,bA2>(A1-rA1,s_b*bA2), A_data,
140 B_layout.template sub<bA2,bB2>(s_b*bA2,b2_b*bB2), B_data,
141 C_layout.template sub<rA1,bB2>(A1-rA1,b2_b*bB2), C_data);
// Trailing rA2 slice of the shared dimension (offset A2-rA2).
// rAdd is true when Add is set or when at least one full bA2 block exists.
// NOTE(review): presumably this selects accumulation into C because the
// full blocks have already contributed -- the use of rAdd is not visible here.
147 const bool rAdd =
Add || (A2/bA2 > 0);
148 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
151 A_layout.template sub<bA1,rA2>(a1_b*bA1,A2-rA2), A_data,
152 B_layout.template sub<rA2,bB2>(A2-rA2,b2_b*bB2), B_data,
153 C_layout.template sub<bA1,bB2>(a1_b*bA1,b2_b*bB2), C_data);
// Corner case: trailing rA1 rows combined with the trailing rA2 slice.
158 A_layout.template sub<rA1,rA2>(A1-rA1,A2-rA2), A_data,
159 B_layout.template sub<rA2,bB2>(A2-rA2,b2_b*bB2), B_data,
160 C_layout.template sub<rA1,bB2>(A1-rA1,b2_b*bB2), C_data);
// ---- Part 2: trailing rB2-wide column block of B and C (offset B2-rB2) ----
// The structure mirrors Part 1 with bB2 replaced by rB2.
// First shared-dimension block, full bA1-high row blocks.
169 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
172 A_layout.template sub<bA1,bA2>(a1_b*bA1,0), A_data,
173 B_layout.template sub<bA2,rB2>(0,B2-rB2), B_data,
174 C_layout.template sub<bA1,rB2>(a1_b*bA1,B2-rB2), C_data);
// First shared-dimension block, trailing rA1 rows.
179 A_layout.template sub<rA1,bA2>(A1-rA1,0), A_data,
180 B_layout.template sub<bA2,rB2>(0,B2-rB2), B_data,
181 C_layout.template sub<rA1,rB2>(A1-rA1,B2-rB2), C_data);
// Remaining full bA2 blocks along the shared dimension (block 0 done above).
186 for (
int s_b = 1; s_b < A2/bA2; s_b++)
188 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
191 A_layout.template sub<bA1,bA2>(a1_b*bA1,s_b*bA2), A_data,
192 B_layout.template sub<bA2,rB2>(s_b*bA2,B2-rB2), B_data,
193 C_layout.template sub<bA1,rB2>(a1_b*bA1,B2-rB2), C_data);
// Trailing rA1 rows for shared-dimension block s_b.
198 A_layout.template sub<rA1,bA2>(A1-rA1,s_b*bA2), A_data,
199 B_layout.template sub<bA2,rB2>(s_b*bA2,B2-rB2), B_data,
200 C_layout.template sub<rA1,rB2>(A1-rA1,B2-rB2), C_data);
// Trailing rA2 slice of the shared dimension; rAdd is computed exactly as in
// Part 1 (Add, or at least one full bA2 block exists).
206 const bool rAdd =
Add || (A2/bA2 > 0);
207 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
210 A_layout.template sub<bA1,rA2>(a1_b*bA1,A2-rA2), A_data,
211 B_layout.template sub<rA2,rB2>(A2-rA2,B2-rB2), B_data,
212 C_layout.template sub<bA1,rB2>(a1_b*bA1,B2-rB2), C_data);
// Final corner: trailing rows (rA1), trailing shared slice (rA2) and trailing
// columns (rB2) all at once.
217 A_layout.template sub<rA1,rA2>(A1-rA1,A2-rA2), A_data,
218 B_layout.template sub<rA2,rB2>(A2-rA2,B2-rB2), B_data,
219 C_layout.template sub<rA1,rB2>(A1-rA1,B2-rB2), C_data);