  std::ostream& operator<<(std::ostream& o, const tensor_ranges& r) {
    for (size_type i=0; i < r.size(); ++i)
      o << "[0.." << r[i] << "]";
    return o;
  }
  template<typename IT> class basic_multi_iterator {
    unsigned N;
    index_set idxnums;
    tensor_ranges ranges;
    tensor_strides strides;
    tensor_ranges cnt;
    index_set ilst2idxnums;
    std::vector<const tensor_strides*> slst;
    std::vector<IT> iter;
    std::vector<int> n;
  public:
    const tensor_ranges& getcnt() const { return cnt; }
    basic_multi_iterator() {
      N = 0; idxnums.reserve(16); strides.reserve(64);
      ilst2idxnums.reserve(64); iter.reserve(4);
    }
    const tensor_ranges& all_ranges() const { return ranges; }
    const index_set& all_indexes() const { return idxnums; }
    /* Register one tensor. Beware: the strides are expected to have one
       more element than idxs and r (the tensor_mask convention). */
    void insert(const index_set& idxs, const tensor_ranges& r,
                const tensor_strides& s, IT it_) {
      assert(idxs.size() == r.size()); assert(s.size() == r.size()+1);
      slst.push_back(&s);
      for (unsigned int i=0; i < idxs.size(); ++i) {
        index_set::const_iterator f=std::find(idxnums.begin(), idxnums.end(), idxs[i]);
        if (f == idxnums.end()) {      /* new index: append it */
          ilst2idxnums.push_back(dim_type(idxnums.size()));
          idxnums.push_back(idxs[i]);
          ranges.push_back(r[i]);
        } else {                       /* known index: ranges must agree */
          ilst2idxnums.push_back(dim_type(f-idxnums.begin()));
          assert(ranges[ilst2idxnums.back()] == r[i]);
        }
      }
      iter.push_back(it_);
      ++N;
    }
    /* Build the interleaved stride table (strides[idx*N + tensor]) and,
       for each tensor, record the highest index that actually moves it. */
    void prepare() {
      unsigned c = 0;
      strides.assign(N*idxnums.size(),0);
      for (unsigned int i=0; i < N; ++i) {
        for (unsigned int j=0; j < slst[i]->size()-1; ++j, ++c) {
          stride_type s = (*slst[i])[j];
          strides[ilst2idxnums[c]*N + i] = s;
        }
      }
      n.assign(N+1,-1);
      for (unsigned int i=0; i < idxnums.size(); ++i) {
        for (unsigned int j=0; j < N; ++j) {
          if (strides[i*N+j]) n[j+1] = int(i);
        }
      }
      cnt.assign(idxnums.size(),0);
    }
    /* Advance iterators b..NN-1 to the next point, odometer style: the
       last index is incremented first; on overflow it is reset and the
       previous index is incremented. Only the indexes numbered above n[b]
       (those that do not belong to tensors 0..b-1) are visited. Returns
       false when the iteration is finished. */
    bool next(unsigned int b) { return next(b,N); }
    bool next(unsigned int b, unsigned int NN) {
      int i0 = int(cnt.size())-1;
      while (i0 > n[b]) {
        if (++cnt[i0] < ranges[i0]) {
          for (unsigned int i=b; i < NN; ++i) {
            iter[i] += strides[i0*NN+i];
          }
          return true;
        } else {
          cnt[i0] = 0;
          for (unsigned int i=b; i < NN; ++i) {
            iter[i] -= strides[i0*NN+i]*(ranges[i0]-1);
          }
          --i0;
        }
      }
      return false;
    }
    template<unsigned b, unsigned NN> bool qnext() { return next(b,NN); }
    IT it(unsigned int b) const { return iter[b]; }
  };
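  /* A minimal usage sketch (illustrative only, not part of the original
     code): merging two masks tm1 and tm2 into a result follows the same
     pattern as tensor_mask::assign() below:

       basic_multi_iterator<unsigned> bmit;
       bmit.insert(tm1.indexes(), tm1.ranges(), tm1.strides(), 0);
       bmit.insert(tm2.indexes(), tm2.ranges(), tm2.strides(), 0);
       // ... insert the output tensor as iterator 2, then:
       bmit.prepare();
       do {
         // bmit.it(0), bmit.it(1), bmit.it(2) are the current
         // linear positions inside each tensor
       } while (bmit.qnext<0,3>());
  */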
  tensor_mask::tensor_mask(const tensor_mask& tm1, const tensor_mask& tm2,
                           bool and_op) {
    assign(tm1,tm2,and_op); unset_card();
  }
  /* Straightforward version of the mask merge: loop over the full
     cartesian product of the merged ranges. Correct but slow; disabled in
     favour of the basic_multi_iterator version below. */
#if 0
  void tensor_mask::assign(const tensor_mask& tm1, const tensor_mask& tm2,
                           bool and_op) {
    clear(); unset_card();
    if (tm1.ndim()==0) { assign(tm2); return; }
    if (tm2.ndim()==0) { assign(tm1); return; }
    r = tm1.ranges();
    idxs = tm1.indexes();

    /* the range of every index, over both masks */
    tensor_ranges global_r(std::max(tm1.max_dim(),tm2.max_dim())+1, index_type(1));

    for (index_type i = 0; i < tm1.indexes().size(); ++i)
      global_r[tm1.indexes()[i]] = tm1.ranges()[i];

    /* merge the indexes of tm2 that are not already in tm1 */
    for (index_type i = 0; i < tm2.indexes().size(); ++i) {
      index_set::const_iterator f=std::find(tm1.indexes().begin(), tm1.indexes().end(), tm2.indexes()[i]);
      if (f == tm1.indexes().end()) {
        assert(global_r[tm2.indexes()[i]]==1);
        global_r[tm2.indexes()[i]] = tm2.ranges()[i];
        r.push_back(tm2.ranges()[i]);
        idxs.push_back(tm2.indexes()[i]);
      }
      else assert(global_r[tm2.indexes()[i]] == tm2.ranges()[i]);
    }
    eval_strides(); assert(size());
    m.assign(size(), false);

    for (tensor_ranges_loop l(global_r); !l.finished(); l.next()) {
      if (and_op) { if (tm1(l.cnt) && tm2(l.cnt)) m.add(pos(l.cnt)); }
      else        { if (tm1(l.cnt) || tm2(l.cnt)) m.add(pos(l.cnt)); }
    }
  }
#else
  void tensor_mask::assign(const tensor_mask& tm1, const tensor_mask& tm2,
                           bool and_op) {
    clear(); unset_card();

    /* special cases: empty masks */
    if (tm1.ndim()==0) { assign(tm2); return; }
    if (tm2.ndim()==0) { assign(tm1); return; }

    /* identical shapes: direct bitwise merge */
    if (tm1.indexes() == tm2.indexes() &&
        tm1.ranges() == tm2.ranges() &&
        tm1.strides() == tm2.strides()) {
      r = tm1.ranges(); idxs=tm1.indexes(); s=tm1.strides();
      assert(tm1.m.size() == tm2.m.size());
      m = tm1.m;
      if (and_op) {
        for (index_type i=0; i < tm2.m.size(); ++i)
          if (!tm2.m[i]) m[i] = false;
      } else {
        for (index_type i=0; i < tm2.m.size(); ++i)
          if (tm2.m[i]) m[i] = true;
      }
      return;
    }

    /* general case: iterate simultaneously over tm1, tm2 and the result */
    basic_multi_iterator<unsigned> bmit;
    bmit.insert(tm1.indexes(), tm1.ranges(), tm1.strides(), 0);
    bmit.insert(tm2.indexes(), tm2.ranges(), tm2.strides(), 0);
    r = bmit.all_ranges(); idxs = bmit.all_indexes(); eval_strides();
    assert(size());
    m.assign(size(), false);
    bmit.insert(indexes(), ranges(), strides(), 0);
    bmit.prepare();

    if (and_op) {
      do {
        if (tm1.m[bmit.it(0)]) {
          do {
            if (tm2.m[bmit.it(1)]) {
              m.add(bmit.it(2));
            }
          } while (bmit.qnext<1,3>());
        }
      } while (bmit.qnext<0,3>());
    } else {
      do {
        bool v1 = tm1.m[bmit.it(0)];
        do {
          if (v1 || (tm2.m[bmit.it(1)]))
            m.add(bmit.it(2));
        } while (bmit.qnext<1,3>());
      } while (bmit.qnext<0,3>());
    }
  }
#endif
  /* merge several masks into a single one */
  tensor_mask::tensor_mask(const std::vector<const tensor_mask*>& tm) {
    assign(tm); unset_card();
  }
  /* Straightforward version, disabled like the one above: walk the full
     cartesian product of the concatenated ranges. */
#if 0
  void tensor_mask::assign(const std::vector<const tensor_mask*>& tm) {
    index_set mask_start; unset_card();
    clear();
    r.reserve(255); idxs.reserve(255); mask_start.reserve(255);
    for (dim_type i=0; i < tm.size(); ++i) {
      mask_start.push_back(r.size());
      for (dim_type j=0; j < tm[i]->ndim(); ++j) {
        r.push_back(tm[i]->ranges()[j]);
        idxs.push_back(tm[i]->indexes()[j]);
      }
    }
    eval_strides(); assert(size());
    m.assign(size(), false);
    for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
      bool is_in = true;
      for (dim_type i=0; is_in && i < tm.size(); ++i) {
        index_type s_ = 0;
        for (dim_type j=0; j < tm[i]->ndim(); ++j)
          s_ += l.cnt[j+mask_start[i]]*tm[i]->strides()[j];
        if (!tm[i]->m[s_]) is_in = false;
      }
      if (is_in) m.add(lpos(l.cnt));
    }
  }
#else
  void tensor_mask::assign(const std::vector<const tensor_mask*>& tm) {
    unset_card();
    if (tm.size() == 0) { clear(); return; }
    if (tm.size() == 1) { assign(*tm[0]); return; }
    clear();

    basic_multi_iterator<unsigned> bmit;
    for (dim_type i=0; i < tm.size(); ++i)
      bmit.insert(tm[i]->indexes(), tm[i]->ranges(), tm[i]->strides(), 0);
    r = bmit.all_ranges(); idxs = bmit.all_indexes(); eval_strides();
    assert(size());
    m.assign(size(), false);
    bmit.insert(indexes(), ranges(), strides(), 0);
    bmit.prepare();

    /* iterate over the N input masks plus the result (iterator number N):
       a point belongs to the merged mask iff it belongs to every input */
    unsigned N = unsigned(tm.size());
    bool finished = false;
    while (!finished) {
      bool is_in = true;
      unsigned int b;
      for (b=0; b < N; ++b) {
        if (!tm[b]->m[bmit.it(b)]) {
          is_in = false;
          break;
        }
      }
      if (is_in) { m[bmit.it(N)] = 1; b = N-1; }
      while (!finished && !bmit.next(b)) { if (b == 0) finished = true; b--; }
    }
  }
#endif
  /* Expand the packed strides (one value per point inside the mask) into
     a full array indexed by the linear position in the mask's range;
     positions outside the mask keep the poisoned value INT_MIN. */
  void tensor_mask::unpack_strides(const tensor_strides& packed,
                                   tensor_strides& unpacked) const {
    assert(packed.size() == card());
    unpacked.assign(size(),INT_MIN);
    index_type i=0;
    for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
      if (m[lpos(l.cnt)]) unpacked[lpos(l.cnt)] = packed[i++];
    }
  }
  void tensor_mask::check_assertions() const {
    GMM_ASSERT3(r.size() == idxs.size(), "");
    GMM_ASSERT3(s.size() == idxs.size()+1, "");
    GMM_ASSERT3(m.size() == size(), "");
    /* the index numbers must all be distinct */
    dal::bit_vector bv;
    for (dim_type i=0; i < idxs.size(); ++i) {
      GMM_ASSERT3(!bv.is_in(idxs[i]), "");
      bv.add(idxs[i]);
    }
  }
  tensor_mask::tensor_mask(const std::vector<const tensor_mask*> tm1,
                           const std::vector<const tensor_mask*> tm2,
                           bool and_op) {
    assign(tensor_mask(tm1), tensor_mask(tm2), and_op);
  }
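  /* Note on the stride representation used below (a summary of the code
     above, not new behaviour): each mask of a tensor_ref carries one
     stride per point *inside* the mask (the "packed" form), and
     unpack_strides() expands this to one value per point of the full
     cartesian range. For instance a 1D mask 101 with packed strides
     {0,3} unpacks to {0, INT_MIN, 3}. set_sub_tensor() relies on the
     expanded form to address tr's data while looping over the new
     masks. */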
  void tensor_ref::set_sub_tensor(const tensor_ref& tr,
                                  const tensor_shape& sub) {
    assign_shape(sub);
    /* fusion of the two shapes */
    merge(tr);

    /* create the strides for the new masks */
    strides_.resize(masks().size());
    for (dim_type i = 0; i < strides_.size(); ++i)
      strides_[i].resize(mask(i).card());

    pbase_ = tr.pbase_; base_shift_ = tr.base_shift();

    /* expand tr's strides for random access */
    std::vector<tensor_strides> trstrides_unpacked(tr.masks().size());
    for (dim_type im = 0; im < tr.masks().size(); ++im) {
      tr.mask(im).check_assertions();
      tr.mask(im).unpack_strides(tr.strides()[im], trstrides_unpacked[im]);
    }

    /* fill the strides of the new masks */
    for (dim_type im = 0; im < masks().size(); ++im) {
      const tensor_mask& m = masks()[im];

      /* the masks of tr that share at least one index with m */
      std::vector<dim_type> trmasks; trmasks.reserve(tr.masks().size());
      for (dim_type i=0; i < m.indexes().size(); ++i) {
        if (tr.index_is_valid(m.indexes()[i])) {
          dim_type im2 = tr.index_to_mask_num(m.indexes()[i]);
          if (std::find(trmasks.begin(), trmasks.end(), im2)==trmasks.end())
            trmasks.push_back(im2);
        }
      }

      tensor_ranges gcnt(tr.ndim(),0);
      stride_type stcnt = 0;

      for (tensor_ranges_loop l(m.ranges()); !l.finished(); l.next()) {
        /* the global position of the current point */
        for (dim_type i=0; i < m.ranges().size(); ++i)
          gcnt[m.indexes()[i]] = l.index(i);

        bool in_m = m(gcnt);
        bool in_trm = true;
        stride_type tr_s = 0;

        /* accumulate the stride of this point in each of tr's masks */
        for (dim_type i=0; i < trmasks.size(); ++i) {
          const tensor_mask &mm = tr.mask(trmasks[i]);
          if (mm(gcnt)) {
            tr_s += trstrides_unpacked[trmasks[i]][mm.pos(gcnt)];
            assert(trstrides_unpacked[trmasks[i]][mm.pos(gcnt)]>=0);
          } else { in_trm = false; break; }
        }

        /* a point of the new mask must lie in tr's masks, and conversely */
        if (in_m) assert(in_trm);
        if (!in_trm) assert(!in_m);

        if (in_m) strides_[im][stcnt++] = tr_s;
      }
      assert(stcnt == stride_type(m.card()));
    }
  }
  /* Slice a tensor_ref along dimension slice.dim at position slice.i0:
     the resulting tensor has one dimension less. */
  tensor_ref::tensor_ref(const tensor_ref& tr, tensor_mask::Slice slice) {
    set_sub_tensor(tr, tr.slice_shape(slice));

    /* shrink the mask of the sliced dimension */
    const tensor_mask& m1(index_to_mask(slice.dim));
    dim_type mdim = index_to_mask_dim(slice.dim);
    if (m1.ndim() > 1) {
      tensor_ranges r(m1.ranges()); r.erase(r.begin()+mdim);
      index_set idx(m1.indexes()); idx.erase(idx.begin()+mdim);
      tensor_mask m2(r,idx);
      index_type pos1 = 0, pos2 = 0;
      for (tensor_ranges_loop l(m1.ranges()); !l.finished(); l.next()) {
        if (l.index(mdim) == slice.i0) {
          m2.set_mask_val(pos2++, m1(pos1));
        } else assert(m1(pos1) == 0);
        pos1++;
      }
      assert(index_to_mask_num(slice.dim) < masks().size());
      masks()[index_to_mask_num(slice.dim)] = m2;
    } else {
      remove_mask(index_to_mask_num(slice.dim));
    }
    /* renumber the remaining dimensions */
    shift_dim_num_ge(slice.dim,-1);
    set_ndim_noclean(dim_type(ndim()-1));
  }
  struct compare_packed_range {
    const std::vector<packed_range_info>& pri;
    std::vector<stride_type> mean_inc;
    compare_packed_range(const std::vector<packed_range_info>& pri_)
      : pri(pri_) {}
    bool operator()(dim_type a, dim_type b) {
      if (pri[a].n < pri[b].n) return true;
      else if (pri[a].n > pri[b].n) return false;
      else { /* same first tensor: prefer the larger mean stride increment */
        if (pri[a].mean_increm > pri[b].mean_increm) return true;
      }
      return false;
    }
  };
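  /* multi_tensor_iterator::init() proceeds, in broad strokes, as follows
     (a reading guide inferred from the code below): (1) intersect the
     shapes of all tensors and re-map every tensor onto the common shape;
     (2) collect the "packed" indexes (masks of cardinality > 1), sort
     them by the first tensor that uses them, and precompute the stride
     increments of each tensor along each packed index; (3) detect runs
     of regular strides, which can later be handed to BLAS as vectorized
     loops. */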
  void multi_tensor_iterator::init(std::vector<tensor_ref> trtab,
                                   bool with_index_values) {
    N = index_type(trtab.size());
    index_type N_packed_idx = 0;

    /* the intersection of all tensor shapes */
    tensor_shape ts(trtab[0]);
    for (dim_type i=1; i < trtab.size(); ++i)
      ts.merge(trtab[i]);

    /* re-map every tensor onto the common shape */
    for (dim_type i = 0; i < N; ++i) {
      tensor_ref tmp = trtab[i];
      trtab[i].set_sub_tensor(tmp,ts);
    }

    /* find the "packed" indexes, i.e. masks of cardinality > 1 */
    dal::bit_vector packed_idx; packed_idx.sup(0,ts.ndim());
    for (index_type mi=0; mi < ts.masks().size(); ++mi) {
      if (ts.masks()[mi].card() != 1) {
        packed_idx.add(mi); N_packed_idx++;
      } else {
        /* all strides over a cardinality-1 mask must be null */
        for (dim_type j=0; j < N; ++j) {
          if (trtab[j].strides()[mi].size() != 0) {
            assert(trtab[j].strides()[mi].size() == 1);
            assert(trtab[j].strides()[mi][0] == 0);
          }
        }
      }
    }

    pr.resize(N_packed_idx);
    pri.resize(N_packed_idx);

    /* for each packed index, find the first tensor that uses it */
    index_type pmi = 0;
    for (dim_type mi = 0; mi < ts.masks().size(); ++mi) {
      if (packed_idx[mi]) {
        dim_type n;
        pri[pmi].original_masknum = mi;
        pri[pmi].range = ts.masks()[mi].card();
        for (n = 0; n < N; n++)
          if (trtab[n].index_is_valid(mi)) break;
        pri[pmi].n = n; pr[pmi].n = n;
        pmi++;
      }
    }

    /* sort the packed indexes by the rank of their first tensor */
    std::sort(pri.begin(), pri.end());

    /* eval bloc ranks */
    bloc_rank.resize(N+1); bloc_rank[0] = 0;
    bloc_nelt.resize(N+1); bloc_nelt[0] = 0;
    for (index_type i=1; i <= N; ++i) {
      bloc_rank[i] = bloc_rank[i-1];
      while (bloc_rank[i] < pri.size() && pri[bloc_rank[i]].n == i-1)
        bloc_rank[i]++;
      bloc_nelt[i] = bloc_rank[i] - bloc_rank[i-1];
    }

    /* stride increments of each tensor along each packed index */
    for (pmi = 0; pmi < pri.size(); ++pmi) {
      index_type mi = pri[pmi].original_masknum;
      pri[pmi].mean_increm = 0;
      pri[pmi].inc.assign(pri[pmi].range*(N-pri[pmi].n), 0);
      pri[pmi].have_regular_strides.reset();
      pri[pmi].have_regular_strides = std::bitset<32>((1 << N)-1);
      for (dim_type n=pri[pmi].n; n < N; ++n) {
        index_type pos0 = (n - pri[pmi].n);
        for (index_type i = 0; i < pri[pmi].range; ++i) {
          index_type pos = i * (N-pri[pmi].n) + pos0;
          if (i != pri[pmi].range-1) {
            stride_type increm = trtab[n].strides()[mi][i+1]
                               - trtab[n].strides()[mi][i];
            pri[pmi].inc[pos] = increm;
            if (pri[pmi].inc[pos] != pri[pmi].inc[pos0])
              pri[pmi].have_regular_strides[n] = false;
            pri[pmi].mean_increm += increm;
          } else {
            /* the last increment rewinds the iterator */
            pri[pmi].inc[pos] = -trtab[n].strides()[mi][i];
          }
        }
      }
      /* packed indexes have range >= 2, hence no division by zero here */
      pri[pmi].mean_increm /= ((N-pri[pmi].n)*(pri[pmi].range-1));
    }

    /* reorder by (first tensor, decreasing mean increment) */
    index_set pr_reorder(pri.size());
    for (pmi = 0; pmi < pri.size(); ++pmi)
      pr_reorder[pmi]=dim_type(pmi);
    std::sort(pr_reorder.begin(), pr_reorder.end(), compare_packed_range(pri));
    {
      std::vector<packed_range>      tmppr(pr);
      std::vector<packed_range_info> tmppri(pri);
      for (dim_type i=0; i < pri.size(); ++i) {
        pr[i]  = tmppr [pr_reorder[i]];
        pri[i] = tmppri[pr_reorder[i]];
      }
    }

    /* setup of the data needed to recover index values while iterating */
    if (with_index_values) {
      idxval.resize(ts.ndim());

      for (dim_type i=0; i < ts.ndim(); ++i) {
        idxval[i].ppinc = NULL; idxval[i].pposbase = NULL; idxval[i].pos_ = 0;
      }

      for (index_type mi = 0; mi < ts.masks().size(); ++mi) {
        tensor_strides v;
        pmi = index_type(-1);
        for (dim_type i=0; i < pr.size(); ++i)
          if (pri[i].original_masknum == mi) pmi = i;

        if (pmi != index_type(-1)) {
          ts.masks()[mi].gen_mask_pos(pri[pmi].mask_pos);
        } else { /* not packed */
          ts.masks()[mi].gen_mask_pos(v); assert(v.size()==1);
        }
        for (dim_type i=0; i < ts.masks()[mi].indexes().size(); ++i) {
          dim_type ii = ts.masks()[mi].indexes()[i];
          idxval[ii].cnt_num = dim_type(pmi);
          idxval[ii].pos_ = (pmi != index_type(-1)) ? 0 : v[0];
          idxval[ii].mod = ts.masks()[mi].strides()[i+1];
          idxval[ii].div = ts.masks()[mi].strides()[i];
        }
      }
    }

    /* check for vectorizable loops */
    vectorized_strides_.resize(N); vectorized_size_ = 0;
    std::fill(vectorized_strides_.begin(), vectorized_strides_.end(), 0);
    vectorized_pr_dim = index_type(pri.size());
    for (vectorized_pr_dim = index_type(pri.size()-1);
         vectorized_pr_dim != index_type(-1); vectorized_pr_dim--) {
      std::vector<packed_range_info>::const_iterator pp
        = pri.begin() + vectorized_pr_dim;
      if (vectorized_pr_dim == pri.size()-1) {
        if (pp->have_regular_strides.count() == N)
          vectorized_size_ = pp->range;
        for (dim_type n=pp->n; n < N; ++n) {
          GMM_ASSERT3(n < pp->inc.size(), "");
          vectorized_strides_[n] = pp->inc[n];
        }
      } else {
        if (pp->have_regular_strides.count() != N) break;
        bool still_ok = true;
        for (dim_type n=pp->n; n < N; ++n)
          if (stride_type(vectorized_strides_[n]*vectorized_size_) != pp->inc[n])
            still_ok = false;
        if (!still_ok) break;
        vectorized_size_ *= pp->range;
      }
    }

    it.resize(N); pit0.resize(N); itbase.resize(N);
    for (dim_type n=0; n < N; ++n) {
      pit0[n]=trtab[n].pbase();
      itbase[n]=trtab[n].base_shift();
    }
    rewind();
  }
  void multi_tensor_iterator::print() const {
    cout << "MTI(N=" << N << "): ";
    for (dim_type i=0; i < pr.size(); ++i)
      cout << " pri[" << int(i) << "]: n=" << int(pri[i].n)
           << ", range=" << pri[i].range << ", mean_increm="
           << pri[i].mean_increm << ", regular = " << pri[i].have_regular_strides
           << ", inc=" << vref(pri[i].inc) << "\n";
    cout << "bloc_rank: " << vref(bloc_rank)
         << ", bloc_nelt: " << vref(bloc_nelt) << "\n";
    cout << "vectorized_size : " << vectorized_size_
         << ", strides = " << vref(vectorized_strides_)
         << ", pr_dim=" << vectorized_pr_dim << "\n";
  }
  void tensor_reduction::insert(const tensor_ref& tr_, const std::string& s) {
    tensor_shape ts(tr_);
    trtab.push_back(tref_or_reduction(tensor_ref(tr_, ts), s));
  }

  /* insert the result of an intermediate sub-reduction */
  void tensor_reduction::insert(const tref_or_reduction& t,
                                const std::string& s) {
    if (!t.is_reduction()) {
      insert(t.tr(), s);
    } else {
      trtab.push_back(t); trtab.back().ridx = s;
    }
  }
  /* Collect the set of reduction index names. A duplicated index inside a
     single tensor (diagonal access such as "ii") is renamed to a fresh
     capital letter, so that each reduction char maps to at most one
     dimension per tensor. */
  void tensor_reduction::update_reduction_chars() {
    reduction_chars.clear();
    for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
      assert(it->ridx.size() == it->tr().ndim());
      for (unsigned i=0; i < it->ridx.size(); ++i) {
        if (it->ridx[i] != ' ' &&
            reduction_chars.find(it->ridx[i]) == std::string::npos)
          reduction_chars.push_back(it->ridx[i]);
      }
    }
    for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
      it->gdim.resize(it->ridx.size());
      for (unsigned i=0; i < it->ridx.size(); ++i) {
        char c = it->ridx[i];
        if (c != ' ' && it->ridx.find(c) != i) {
          for (c = 'A'; c <= 'Z'; ++c)
            if (reduction_chars.find(c) == std::string::npos) break;
          it->ridx[i] = c;
          reduction_chars.push_back(it->ridx[i]);
        }
      }
    }
  }
  /* Dimensions marked with ' ' are not reduced: they are appended, in
     order, to the range of the result tensor. */
  void tensor_reduction::pre_prepare() {
    for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
      assert(it->ridx.size() == it->tr().ndim());
      it->rdim.resize(it->ridx.size());
      for (dim_type i=0; i < it->ridx.size(); ++i) {
        if (it->ridx[i] == ' ') {
          reduced_range.push_back(it->tr().dim(i));
          it->rdim[i] = dim_type(reduced_range.size()-1);
        } else it->rdim[i] = dim_type(-1);
      }
    }
  }
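  /* For instance (an illustration, not taken from the sources): the
     matrix product C = A*B, i.e. C(i,k) = sum_j A(i,j)*B(j,k), would be
     set up with insert(A, " j") and insert(B, "j "). The blank positions
     become the output dimensions (i, then k) collected in reduced_range,
     while 'j' remains the single reduction char shared by both
     tensors. */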
  /* Select the next sub-reduction to perform: among all reduction chars,
     find the subset of tensors whose contraction yields the smallest
     intermediate result. Returns that (estimated) size. */
  size_type tensor_reduction::find_best_sub_reduction(dal::bit_vector &best_lst,
                                                      std::string &best_idxset) {
    dal::bit_vector lst;
    std::string idxset;
    best_lst.clear(); best_idxset.clear();

    update_reduction_chars();

    GMM_ASSERT2(trtab.size() <= 32, "wow it was assumed that nobody would "
                "ever need a reduction on more than 32 tensors..");

    /* for each reduction char, the set of tensors that use it */
    std::vector<std::bitset<32> > idx_occurences(reduction_chars.size());

    for (unsigned ir=0; ir < reduction_chars.size(); ++ir) {
      char c = reduction_chars[ir];
      for (unsigned tnum=0; tnum < trtab.size(); ++tnum)
        idx_occurences[ir][tnum] = (trtab[tnum].ridx.find(c) != std::string::npos);
    }

    size_type best_redsz = size_type(-1);
    for (unsigned ir=0; ir < reduction_chars.size(); ++ir) {
      lst.clear(); lst.add(ir);
      idxset.resize(0); idxset.push_back(reduction_chars[ir]);
      /* add every other char whose tensor set is included in this one */
      for (unsigned ir2=0; ir2 < reduction_chars.size(); ++ir2) {
        if (ir2 != ir &&
            (idx_occurences[ir2] | idx_occurences[ir]) == idx_occurences[ir]) {
          lst.add(ir2);
          idxset.push_back(reduction_chars[ir2]);
        }
      }
      /* estimate the size of the result of this sub-reduction: the
         product of the non-reduced dimensions of the tensors involved */
      size_type redsz = 1;
      for (unsigned tnum=0; tnum < trtab.size(); ++tnum) {
        if (!idx_occurences[ir][tnum]) continue;
        std::bitset<int(32)> once((int)reduction_chars.size());
        for (dim_type i=0; i < trtab[tnum].tr().ndim(); ++i) {
          bool ignore = false;
          for (dal::bv_visitor j(lst); !j.finished(); ++j) {
            if (trtab[tnum].ridx[i] == reduction_chars[(size_t)j]) {
              if (once[j]) ignore = true;
              else once[j] = true;
            }
          }
          if (!ignore)
            redsz *= trtab[tnum].tr().dim(i);
        }
      }
      if (redsz < best_redsz) {
        best_redsz = redsz;
        best_lst.clear();
        for (unsigned i=0; i < trtab.size(); ++i)
          if (idx_occurences[ir][i]) best_lst.add(i);
        best_idxset = idxset;
      }
    }
    return best_redsz;
  }
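  /* Illustrative example: with three tensors inserted as " j", "jk" and
     "k " (a chain product), 'j' occurs only in the first two tensors and
     'k' only in the last two. find_best_sub_reduction() then selects the
     cheaper of the two pairwise contractions, and make_sub_reductions()
     below extracts it into a nested tensor_reduction whose result
     replaces the contracted pair in trtab. */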
  void tensor_reduction::make_sub_reductions() {
    dal::bit_vector bv; std::string red;
    do {
      find_best_sub_reduction(bv,red);
      if (bv.card() < trtab.size() && red.size()) {
        /* extract the tensors of bv into a sub-reduction */
        auto sub = std::make_shared<tensor_reduction>();
        std::vector<dim_type> new_rdim; new_rdim.reserve(16);
        std::string new_reduction;
        for (dal::bv_visitor tnum(bv); !tnum.finished(); ++tnum) {
          tref_or_reduction &t = trtab[tnum];
          std::string re = t.ridx; t.ridx.clear();
          for (unsigned i = 0; i < re.size(); ++i) {
            bool reduced = false;
            char c = re[i];
            if (c != ' ') {
              if (red.find(re[i]) == std::string::npos) c = ' ';
              else reduced = true;
            }
            if (!reduced) {
              /* this dimension survives the sub-reduction */
              t.ridx.push_back(re[i]);
              new_rdim.push_back(t.rdim[i]);
              new_reduction.push_back(re[i]);
            }
            re[i] = c;
          }
          sub->insert(trtab[tnum], re);
        }
        /* replace the extracted tensors by the sub-reduction result */
        trtab[bv.first_true()] = tref_or_reduction(sub, new_reduction);
        trtab[bv.first_true()].rdim = new_rdim;
        std::vector<tref_or_reduction> trtab2;
        trtab2.reserve(trtab.size() - bv.card() + 1);
        for (size_type i=0; i < trtab.size(); ++i)
          if (!bv.is_in(i) || i == bv.first_true())
            trtab2.push_back(trtab[i]);
        trtab.swap(trtab2);
      } else break;
    } while (true);
  }
  void tensor_reduction::prepare(const tensor_ref* tr_out) {
    pre_prepare();
    make_sub_reductions();

    /* create the output tensor */
    if (tr_out == NULL) {
      trres = tensor_ref(reduced_range);
      out_data.resize(trres.card());
      pout_data = &out_data[0];
      trres.set_base(pout_data);
    } else {
      GMM_ASSERT3(tr_out->ndim() == reduced_range.size(), "");
      for (dim_type i=0; i < tr_out->ndim(); ++i)
        GMM_ASSERT3(tr_out->dim(i) == reduced_range[i], "");
      trres = *tr_out;
    }
    /* build the global range (the union of the ranges of all tensors)
       and the position of each reduction index inside it */
    tensor_ranges global_range;
    std::string global_chars;

    global_range.reserve(16);
    global_range.assign(reduced_range.begin(), reduced_range.end());
    global_chars.insert(size_type(0), reduced_range.size(), ' ');
    for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
      assert(it->rdim.size() == it->tr().ndim());
      it->gdim = it->rdim;
      for (dim_type i=0; i < it->ridx.size(); ++i) {
        if (it->rdim[i] == dim_type(-1)) {
          assert(it->ridx[i] != ' ');
          size_type p = global_chars.find(it->ridx[i]);
          if (p == std::string::npos) {
            global_chars.push_back(it->ridx[i]);
            global_range.push_back(it->tr().dim(i));
            it->gdim[i] = dim_type(global_range.size() - 1);
          } else {
            GMM_ASSERT1(it->tr().dim(i) == global_range[p],
                        "inconsistent dimensions for reduction index "
                        << it->ridx[i] << "(" << int(it->tr().dim(i))
                        << " != " << int(global_range[p]) << ")");
            it->gdim[i] = dim_type(p);
          }
        }
      }
    }

    /* permute every tensor so that its dimensions follow the order of
       the global range; the non-reduced dimensions come first */
    std::vector<dim_type> reorder(global_chars.size(), dim_type(-1));
    for (dim_type i=0; i < reduced_range.size(); ++i) reorder[i] = i;
    trres.permute(reorder);
    std::vector<tensor_ref> tt; tt.reserve(trtab.size()+1);
    tt.push_back(trres);

    for (trtab_iterator it = trtab.begin(); it != trtab.end(); ++it) {
      std::fill(reorder.begin(), reorder.end(), dim_type(-1));
      for (dim_type i=0; i < it->gdim.size(); ++i) {
        reorder[it->gdim[i]] = i;
      }
      it->tr().permute(reorder);
      tt.push_back(it->tr());
    }

    mti.init(tt, false);
  }
  /* reduction of a single tensor: accumulate its values into p(0) */
  static void do_reduction1(bgeot::multi_tensor_iterator &mti) {
    do {
      scalar_type s1 = 0;
      do {
        s1 += mti.p(1);
      } while (mti.bnext(1));
      mti.p(0) += s1;
    } while (mti.bnext(0));
  }
  /* vectorized reduction of two tensors via BLAS daxpy */
  static bool do_reduction2v(bgeot::multi_tensor_iterator &mti) {
    BLAS_INT n = mti.vectorized_size();
    const std::vector<stride_type> &s = mti.vectorized_strides();
    if (n && s[0] && s[1] && s[2] == 0) {
      BLAS_INT incx = s[1], incy = s[0];
      do {
        gmm::daxpy_(&n, &mti.p(2), &mti.p(1), &incx, &mti.p(0), &incy);
      } while (mti.vnext());
      return true;
    } else return false;
  }
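  /* The BLAS mapping used above: y = p(0) with increment incy = s[0],
     x = p(1) with increment incx = s[1], and alpha = p(2), which is
     constant over the run since its stride s[2] is zero; daxpy_ then
     computes y += alpha*x over the vectorized_size() elements. The same
     trick is reused below for three tensors, with alpha = p(2)*p(3). */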
  static void do_reduction2a(bgeot::multi_tensor_iterator &mti) {
    if (!do_reduction2v(mti)) {
      do {
        mti.p(0) += mti.p(1)*mti.p(2);
      } while (mti.bnext(0));
    }
  }
  /* generic (non-vectorizable) reduction of two tensors; the loop nest
     follows the bloc structure of the iterator */
  static void do_reduction2b(bgeot::multi_tensor_iterator &mti) {
    do {
      do {
        scalar_type v1 = mti.p(1);
        do {
          mti.p(0) += v1*mti.p(2);
        } while (mti.bnext(2));
      } while (mti.bnext(1));
    } while (mti.bnext(0));
  }
  static bool do_reduction3v(bgeot::multi_tensor_iterator &mti) {
    BLAS_INT n = mti.vectorized_size();
    const std::vector<stride_type> &s = mti.vectorized_strides();
    if (n && s[0] && s[1] && s[2] == 0 && s[3] == 0) {
      BLAS_INT incx = s[1], incy = s[0];
      do {
        double v = mti.p(2)*mti.p(3);
        gmm::daxpy_(&n, &v, &mti.p(1), &incx, &mti.p(0), &incy);
      } while (mti.vnext());
      return true;
    } else return false;
  }
  static void do_reduction3a(bgeot::multi_tensor_iterator &mti) {
    if (!do_reduction3v(mti)) {
      do {
        mti.p(0) += mti.p(1)*mti.p(2)*mti.p(3);
      } while (mti.bnext(0));
    }
  }
  static void do_reduction3b(bgeot::multi_tensor_iterator &mti) {
    do {
      do {
        scalar_type v1 = mti.p(1);
        do {
          scalar_type v2 = v1*mti.p(2);
          do {
            mti.p(0) += v2*mti.p(3);
          } while (mti.bnext(3));
        } while (mti.bnext(2));
      } while (mti.bnext(1));
    } while (mti.bnext(0));
  }
  void tensor_reduction::do_reduction() {
    /* the destination must be zeroed before the computation: every
       reduction path accumulates into it (a consequence of using daxpy) */
    if (out_data.size())
      memset(&out_data[0], 0, out_data.size()*sizeof(out_data[0]));

    /* resolve the sub-reductions first */
    for (unsigned i=0; i < trtab.size(); ++i) {
      if (trtab[i].is_reduction()) {
        trtab[i].reduction->do_reduction();
        trtab[i].reduction->result(trtab[i].tr());
      }
    }

    mti.rewind();
    dim_type N = dim_type(trtab.size());
    if (N == 1) {
      do_reduction1(mti);
    } else if (N == 2) {
      if (!mti.bnext_useful(2) && !mti.bnext_useful(1)) {
        do_reduction2a(mti);
      } else {
        do_reduction2b(mti);
      }
    } else if (N == 3) {
      if (!mti.bnext_useful(1) && !mti.bnext_useful(2) && !mti.bnext_useful(3)) {
        do_reduction3a(mti);
      } else {
        do_reduction3b(mti);
      }
    } else if (N == 4) {
      do {
        do {
          do {
            do {
              do {
                mti.p(0) += mti.p(1)*mti.p(2)*mti.p(3)*mti.p(4);
              } while (mti.bnext(4));
            } while (mti.bnext(3));
          } while (mti.bnext(2));
        } while (mti.bnext(1));
      } while (mti.bnext(0));
    } else {
      GMM_ASSERT1(false, "unhandled reduction case ! (N=" << int(N) << ")");
    }
  }
  void tensor_reduction::clear() {
    trtab.clear();
    reduced_range.resize(0);
    reduction_chars.clear();
    out_data.resize(0);
    pout_data = 0; trtab.reserve(10);
  }
  void tensor_mask::print(std::ostream &o) const {
    index_type c=card(true);
    o << "   mask : card=" << c << "(card_=" << card_
      << ", uptodate=" << card_uptodate << "), indexes=";
    for (dim_type i=0; i < idxs.size(); ++i)
      o << (i==0?"":", ") << int(idxs[i]) << ":" << int(r[i]);
    if (c == size()) o << " FULL" << endl;
    else {
      if (idxs.size() == 1) {
        for (index_type i=0; i < m.size(); ++i) o << (m[i] ? 1 : 0);
      } else {
        for (tensor_ranges_loop l(r); !l.finished(); l.next()) {
          if (l.cnt[0] == 0 && l.cnt[1] == 0 && r.size()>2) {
            o << "\n  ";
            for (dim_type i=2; i < r.size(); ++i) o << "," << l.cnt[i];
            o << ":";
          }
          o << (m[lpos(l.cnt)] ? 1 : 0);
          if (l.cnt[0] == r[0]-1) {
            if (l.cnt[1] != r[1]-1) o << ",";
            else if (idxs.size() > 2) o << "}";
          }
        }
      }
      o << endl;
    }
  }
  void tensor_shape::print(std::ostream& o) const {
    o << "  tensor_shape: n=" << idx2mask.size() << ", idx2mask=";
    for (dim_type i=0; i < idx2mask.size(); ++i) {
      o << (i==0?"":",");
      if (idx2mask[i].is_valid()) {
        o << "r" << dim(i) << ":m" << int(idx2mask[i].mask_num)
          << "/" << int(idx2mask[i].mask_dim);
      } else o << " (na) ";
    }
    o << endl;
    for (dim_type i=0; i < masks_.size(); ++i) o << masks_[i];
    o << "  ^-- end tensor_shape" << endl;
  }
  void tensor_ref::print(std::ostream& o) const {
    o << "tensor_ref, n=" << int(ndim()) << ", card=" << card(true)
      << ", base=" << base() << endl;
    for (dim_type i=0; i < strides().size(); ++i) {
      o << " * strides["<<i<<"]={";
      for (size_type j=0; j < strides()[i].size(); ++j)
        o << (j>0?",":"") << strides()[i][j];
      o << "}" << endl;
    }
    multi_tensor_iterator mti(*this, true);
    do {
      for (dim_type i = 0; i < mti.ndim(); ++i) {
        o << (i==0?"(":",");
        if (index_is_valid(i))
          o << mti.index(i);
        else o << "*";
      }
      o << ")";
      if (base()) {
        o << " = " << mti.p(0) << "\t@base+" << &mti.p(0) - base();
      } else o << "\t@" << size_t(&mti.p(0))/sizeof(scalar_type);
      o << endl;
    } while (mti.qnext1());

    o << "^---- end tensor_ref" << endl;
  }
  std::ostream& operator<<(std::ostream& o, const tensor_mask& m)
  { m.print(o); return o; }
  std::ostream& operator<<(std::ostream& o, const tensor_shape& ts)
  { ts.print(o); return o; }
  std::ostream& operator<<(std::ostream& o, const tensor_ref& tr)
  { tr.print(o); return o; }
  void print_tm(const tensor_mask& tm) { tm.print_(); }
  void print_ts(const tensor_shape& ts) { ts.print_(); }
  void print_tr(const tensor_ref& tr) { tr.print_(); }