88void bMult_AB(
const A_layout_t &A_layout,
const A_data_t &A_data,
89 const B_layout_t &B_layout,
const B_data_t &B_data,
90 const C_layout_t &C_layout, C_data_t &C_data)
92 MFEM_STATIC_ASSERT(A_layout_t::rank == 2 && B_layout_t::rank == 2 &&
93 C_layout_t::rank == 2,
"invalid ranks");
94 const int A1 = A_layout_t::dim_1;
95 const int A2 = A_layout_t::dim_2;
96 const int B1 = B_layout_t::dim_1;
97 const int B2 = B_layout_t::dim_2;
98 const int C1 = C_layout_t::dim_1;
99 const int C2 = C_layout_t::dim_2;
100 MFEM_STATIC_ASSERT(A2 == B1 && A1 == C1 && B2 == C2,
101 "invalid dimensions");
103 const int rA1 = A1%bA1;
104 const int rA2 = A2%bA2;
105 const int rB2 = B2%bB2;
107 for (
int b2_b = 0; b2_b < B2/bB2; b2_b++)
112 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
115 A_layout.template sub<bA1,bA2>(a1_b*bA1,0), A_data,
116 B_layout.template sub<bA2,bB2>(0,b2_b*bB2), B_data,
117 C_layout.template sub<bA1,bB2>(a1_b*bA1,b2_b*bB2), C_data);
122 A_layout.template sub<rA1,bA2>(A1-rA1,0), A_data,
123 B_layout.template sub<bA2,bB2>(0,b2_b*bB2), B_data,
124 C_layout.template sub<rA1,bB2>(A1-rA1,b2_b*bB2), C_data);
126 for (
int s_b = 1; s_b < A2/bA2; s_b++)
128 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
131 A_layout.template sub<bA1,bA2>(a1_b*bA1,s_b*bA2), A_data,
132 B_layout.template sub<bA2,bB2>(s_b*bA2,b2_b*bB2), B_data,
133 C_layout.template sub<bA1,bB2>(a1_b*bA1,b2_b*bB2), C_data);
138 A_layout.template sub<rA1,bA2>(A1-rA1,s_b*bA2), A_data,
139 B_layout.template sub<bA2,bB2>(s_b*bA2,b2_b*bB2), B_data,
140 C_layout.template sub<rA1,bB2>(A1-rA1,b2_b*bB2), C_data);
146 const bool rAdd =
Add || (A2/bA2 > 0);
147 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
150 A_layout.template sub<bA1,rA2>(a1_b*bA1,A2-rA2), A_data,
151 B_layout.template sub<rA2,bB2>(A2-rA2,b2_b*bB2), B_data,
152 C_layout.template sub<bA1,bB2>(a1_b*bA1,b2_b*bB2), C_data);
157 A_layout.template sub<rA1,rA2>(A1-rA1,A2-rA2), A_data,
158 B_layout.template sub<rA2,bB2>(A2-rA2,b2_b*bB2), B_data,
159 C_layout.template sub<rA1,bB2>(A1-rA1,b2_b*bB2), C_data);
168 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
171 A_layout.template sub<bA1,bA2>(a1_b*bA1,0), A_data,
172 B_layout.template sub<bA2,rB2>(0,B2-rB2), B_data,
173 C_layout.template sub<bA1,rB2>(a1_b*bA1,B2-rB2), C_data);
178 A_layout.template sub<rA1,bA2>(A1-rA1,0), A_data,
179 B_layout.template sub<bA2,rB2>(0,B2-rB2), B_data,
180 C_layout.template sub<rA1,rB2>(A1-rA1,B2-rB2), C_data);
185 for (
int s_b = 1; s_b < A2/bA2; s_b++)
187 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
190 A_layout.template sub<bA1,bA2>(a1_b*bA1,s_b*bA2), A_data,
191 B_layout.template sub<bA2,rB2>(s_b*bA2,B2-rB2), B_data,
192 C_layout.template sub<bA1,rB2>(a1_b*bA1,B2-rB2), C_data);
197 A_layout.template sub<rA1,bA2>(A1-rA1,s_b*bA2), A_data,
198 B_layout.template sub<bA2,rB2>(s_b*bA2,B2-rB2), B_data,
199 C_layout.template sub<rA1,rB2>(A1-rA1,B2-rB2), C_data);
205 const bool rAdd =
Add || (A2/bA2 > 0);
206 for (
int a1_b = 0; a1_b < A1/bA1; a1_b++)
209 A_layout.template sub<bA1,rA2>(a1_b*bA1,A2-rA2), A_data,
210 B_layout.template sub<rA2,rB2>(A2-rA2,B2-rB2), B_data,
211 C_layout.template sub<bA1,rB2>(a1_b*bA1,B2-rB2), C_data);
216 A_layout.template sub<rA1,rA2>(A1-rA1,A2-rA2), A_data,
217 B_layout.template sub<rA2,rB2>(A2-rA2,B2-rB2), B_data,
218 C_layout.template sub<rA1,rB2>(A1-rA1,B2-rB2), C_data);