c++ - Why is the data type having an effect on performance in this particular case?
I've written the following code to benchmark the effect of cache misses on performance:
#include <chrono>
#include <cstdint>
#include <cstring>
#include <iostream>

// Avoiding using a power of 2 because of possible performance degradation due to cache associativity?
static const size_t row_size = 600;
static const size_t col_size = 600;
static const size_t test_count = 50;

#define same_types 1
#define init_to_one 0

#if same_types
#define ary_type uint64_t
#define ret_type uint64_t
#else
#define ary_type uint32_t
#define ret_type uint64_t
#endif

ret_type sum_array_rows(const ary_type *a, const size_t step)
{
    ret_type sum = 0;

    for(size_t row = 0; row < row_size; row++)
        for(size_t col = 0; col < col_size; col++)
            sum += static_cast<ret_type>(a[row * step + col]);

    return sum;
}

ret_type sum_array_cols(const ary_type *a, const size_t step)
{
    ret_type sum = 0;

    for(size_t col = 0; col < col_size; col++)
        for(size_t row = 0; row < row_size; row++)
            sum += static_cast<ret_type>(a[row * step + col]);

    return sum;
}

int main()
{
#if init_to_one
    ary_type *a = new ary_type[row_size * col_size];
    for(int i = 0; i < row_size * col_size; i++) a[i] = 1;
#else
    ary_type *a = new ary_type[row_size * col_size]();
#endif

    std::chrono::high_resolution_clock hrc;

    std::cout << "same_types: " << same_types << "\n";
    std::cout << "init_to_one: " << init_to_one << "\n";
    std::cout << "row_size: " << row_size << "\n";
    std::cout << "col_size: " << col_size << "\n\n";

    {
        ret_type sum = 0;

        auto start = hrc.now();
        for(int i = 0; i < test_count; i++)
            sum = sum_array_rows(a, col_size);
        auto end = hrc.now();

        std::cout << "time taken: " << (end - start).count() / test_count << "\n";
        std::cout << "sum: " << sum << "\n";
    }

    // I've added this because I want to trash the cache.
    // I'm not sure if it's necessary or if it's doing any good...
    ary_type *other = new ary_type[row_size * col_size];
    for(int i = 0; i < row_size * col_size; i++) other[i] = 1;

    {
        ret_type sum = other[row_size] - 1;

        auto start = hrc.now();
        for(int i = 0; i < test_count; i++)
            sum = sum_array_cols(a, col_size);
        auto end = hrc.now();

        std::cout << "time taken: " << (end - start).count() / test_count << "\n";
        std::cout << "sum: " << sum << "\n";
    }

    return 0;
}
I have two functions, sum_array_rows and sum_array_cols, that take in an array, add up the elements, and return the sum. The only difference is the order in which the elements are accessed. The return type of both functions is uint64_t.
I have a define near the top of the file called same_types. If same_types is set, both the return type and the type of the elements are uint64_t. If not, the type of the elements is uint32_t while the return type stays uint64_t.
Running the code gives me...
With same_types set to 1:
same_types: 1
init_to_one: 0
row_size: 600
col_size: 600

time taken: 80948     (element type uint64_t, row first)
sum: 0

time taken: 566241    (element type uint64_t, col first)
sum: 0
With same_types set to 0:
same_types: 0
init_to_one: 0
row_size: 600
col_size: 600

time taken: 283348    (element type uint32_t, row first)
sum: 0

time taken: 369653    (element type uint32_t, col first)
sum: 0
I'm confused as to why this is having an effect on performance. When the data types are the same, the result seems as expected: row-col is faster than col-row. Why, when the types are different, do I see a decrease in row-col performance and an increase in col-row performance? I understand that if the data type of the elements is larger then less of the array fits in the cache, but then why am I seeing an increase in performance when iterating over the columns in the outer loop? Can someone please explain this to me?
In case it matters, I'm compiling with VS 2015 and my CPU is an i5-4460 (running Windows 10).
Edit: I guess I shouldn't have blindly trusted the compiler. I was compiling using /O2, the default setting. When I compile with no optimisation I get the expected behaviour:
With same_types set to 1:
same_types: 1
init_to_one: 0
row_size: 600
col_size: 600

time taken: 1009518   (element type uint64_t, row first)
sum: 0

time taken: 1504863   (element type uint64_t, col first)
sum: 0
With same_types set to 0:
same_types: 0
init_to_one: 0
row_size: 600
col_size: 600

time taken: 909479    (element type uint32_t, row first)
sum: 0

time taken: 1244492   (element type uint32_t, col first)
sum: 0
The data type still has an effect, but now it seems reasonable. Here is the assembly when compiling with optimisation on:
optimisation ... /O2
same_type ...... 1

ret_type sum_array_rows(const ary_type *a, const size_t step) {
00fd10c0  xorps   xmm2,xmm2
    ret_type sum = 0;
00fd10c3  lea     eax,[ecx+10h]
00fd10c6  movaps  xmm1,xmm2
00fd10c9  mov     edx,258h
00fd10ce  xchg    ax,ax
    for(size_t col = 0; col < col_size; col++)
00fd10d0  mov     ecx,96h
        sum += static_cast<ret_type>(a[row * step + col]);
00fd10d5  movups  xmm0,xmmword ptr [eax-10h]
00fd10d9  paddq   xmm2,xmm0
00fd10dd  movups  xmm0,xmmword ptr [eax]
00fd10e0  add     eax,20h
00fd10e3  paddq   xmm1,xmm0
00fd10e7  sub     ecx,1
00fd10ea  jne     sum_array_rows+15h (0fd10d5h)
    for(size_t row = 0; row < row_size; row++)
00fd10ec  sub     edx,1
00fd10ef  jne     sum_array_rows+10h (0fd10d0h)
    return sum;
}
00fd10f1  paddq   xmm1,xmm2
00fd10f5  movaps  xmm0,xmm1
00fd10f8  psrldq  xmm0,8
00fd10fd  paddq   xmm1,xmm0
00fd1101  movd    eax,xmm1
00fd1105  psrldq  xmm1,4
00fd110a  movd    edx,xmm1
00fd110e  ret

ret_type sum_array_cols(const ary_type *a, const size_t step) {
00fd1110  push    ebp
00fd1111  mov     ebp,esp
00fd1113  sub     esp,24h
00fd1116  push    ebx
00fd1117  xorps   xmm0,xmm0
    ret_type sum = 0;
00fd111a  mov     dword ptr [ebp-10h],258h
00fd1121  push    esi
00fd1122  movlpd  qword ptr [ebp-18h],xmm0
00fd1127  lea     eax,[ecx+2580h]
00fd112d  mov     edx,dword ptr [ebp-14h]
00fd1130  push    edi
00fd1131  mov     edi,dword ptr [sum]
00fd1134  mov     dword ptr [ebp-0ch],eax
00fd1137  nop     word ptr [eax+eax]
    for(size_t row = 0; row < row_size; row++)
00fd1140  xorps   xmm0,xmm0
00fd1143  mov     dword ptr [ebp-8],0c8h
00fd114a  movlpd  qword ptr [sum],xmm0
00fd114f  mov     ecx,dword ptr [ebp-18h]
00fd1152  mov     ebx,dword ptr [ebp-14h]
00fd1155  movlpd  qword ptr [ebp-20h],xmm0
00fd115a  mov     esi,dword ptr [ebp-20h]
00fd115d  mov     dword ptr [ebp-4],ecx
00fd1160  mov     ecx,dword ptr [ebp-1ch]
00fd1163  nop     dword ptr [eax]
00fd1167  nop     word ptr [eax+eax]
        sum += static_cast<ret_type>(a[row * step + col]);
00fd1170  add     edi,dword ptr [eax-2580h]
00fd1176  mov     dword ptr [sum],edi
00fd1179  adc     edx,dword ptr [eax-257ch]
00fd117f  mov     edi,dword ptr [ebp-4]
00fd1182  add     edi,dword ptr [eax-12c0h]
00fd1188  mov     dword ptr [ebp-4],edi
00fd118b  adc     ebx,dword ptr [eax-12bch]
00fd1191  add     esi,dword ptr [eax]
00fd1193  mov     edi,dword ptr [sum]
00fd1196  adc     ecx,dword ptr [eax+4]
00fd1199  lea     eax,[eax+3840h]
00fd119f  sub     dword ptr [ebp-8],1
00fd11a3  jne     sum_array_cols+60h (0fd1170h)
    for(size_t col = 0; col < col_size; col++)
00fd11a5  add     esi,dword ptr [ebp-4]
00fd11a8  mov     eax,dword ptr [ebp-0ch]
00fd11ab  adc     ecx,ebx
00fd11ad  add     edi,esi
00fd11af  adc     edx,ecx
00fd11b1  add     eax,8
00fd11b4  sub     dword ptr [ebp-10h],1
00fd11b8  mov     dword ptr [ebp-0ch],eax
00fd11bb  jne     sum_array_cols+30h (0fd1140h)
    return sum;
00fd11bd  mov     eax,edi
}
00fd11bf  pop     edi
00fd11c0  pop     esi
00fd11c1  pop     ebx
00fd11c2  mov     esp,ebp
00fd11c4  pop     ebp
00fd11c5  ret

================

optimisation ... /O2
same_type ...... 0

ret_type sum_array_rows(const ary_type *a, const size_t step) {
00a110c0  push    ebp
00a110c1  mov     ebp,esp
00a110c3  sub     esp,24h
00a110c6  push    ebx
00a110c7  xorps   xmm0,xmm0
    ret_type sum = 0;
00a110ca  mov     dword ptr [ebp-0ch],258h
00a110d1  movlpd  qword ptr [ebp-18h],xmm0
00a110d6  mov     edx,dword ptr [ebp-14h]
00a110d9  mov     eax,dword ptr [sum]
00a110dc  push    esi
00a110dd  push    edi
00a110de  xchg    ax,ax
    for(size_t col = 0; col < col_size; col++)
00a110e0  xorps   xmm0,xmm0
00a110e3  mov     dword ptr [ebp-8],0c8h
00a110ea  movlpd  qword ptr [sum],xmm0
00a110ef  mov     esi,dword ptr [ebp-18h]
00a110f2  mov     ebx,dword ptr [ebp-14h]
00a110f5  movlpd  qword ptr [ebp-20h],xmm0
00a110fa  mov     edi,dword ptr [ebp-20h]
00a110fd  mov     dword ptr [ebp-4],esi
00a11100  mov     esi,dword ptr [ebp-1ch]
        sum += static_cast<ret_type>(a[row * step + col]);
00a11103  add     eax,dword ptr [ecx]
00a11105  adc     edx,0
00a11108  mov     dword ptr [sum],edx
00a1110b  mov     edx,dword ptr [ebp-4]
00a1110e  add     edx,dword ptr [ecx+4]
00a11111  mov     dword ptr [ebp-4],edx
00a11114  mov     edx,dword ptr [sum]
00a11117  adc     ebx,0
00a1111a  add     edi,dword ptr [ecx+8]
00a1111d  adc     esi,0
00a11120  add     ecx,0ch
00a11123  sub     dword ptr [ebp-8],1
00a11127  jne     sum_array_rows+43h (0a11103h)
    for(size_t row = 0; row < row_size; row++)
00a11129  add     edi,dword ptr [ebp-4]
00a1112c  adc     esi,ebx
00a1112e  add     eax,edi
00a11130  adc     edx,esi
00a11132  sub     dword ptr [ebp-0ch],1
00a11136  jne     sum_array_rows+20h (0a110e0h)
    return sum;
}
00a11138  pop     edi
00a11139  pop     esi
00a1113a  pop     ebx
00a1113b  mov     esp,ebp
00a1113d  pop     ebp
00a1113e  ret

ret_type sum_array_cols(const ary_type *a, const size_t step) {
00a11140  push    ebp
00a11141  mov     ebp,esp
00a11143  sub     esp,24h
00a11146  push    ebx
00a11147  xorps   xmm0,xmm0
    ret_type sum = 0;
00a1114a  mov     dword ptr [ebp-10h],258h
00a11151  push    esi
00a11152  movlpd  qword ptr [ebp-18h],xmm0
00a11157  lea     eax,[ecx+12c0h]
00a1115d  mov     edx,dword ptr [ebp-14h]
00a11160  push    edi
00a11161  mov     edi,dword ptr [sum]
00a11164  mov     dword ptr [ebp-0ch],eax
00a11167  nop     word ptr [eax+eax]
    for(size_t row = 0; row < row_size; row++)
00a11170  xorps   xmm0,xmm0
00a11173  mov     dword ptr [ebp-8],0c8h
00a1117a  movlpd  qword ptr [sum],xmm0
00a1117f  mov     ecx,dword ptr [ebp-18h]
00a11182  mov     ebx,dword ptr [ebp-14h]
00a11185  movlpd  qword ptr [ebp-20h],xmm0
00a1118a  mov     esi,dword ptr [ebp-20h]
00a1118d  mov     dword ptr [ebp-4],ecx
00a11190  mov     ecx,dword ptr [ebp-1ch]
00a11193  nop     dword ptr [eax]
00a11197  nop     word ptr [eax+eax]
        sum += static_cast<ret_type>(a[row * step + col]);
00a111a0  add     edi,dword ptr [eax-12c0h]
00a111a6  lea     eax,[eax+1c20h]
00a111ac  adc     edx,0
00a111af  mov     dword ptr [sum],edx
00a111b2  mov     edx,dword ptr [ebp-4]
00a111b5  add     edx,dword ptr [eax-2580h]
00a111bb  mov     dword ptr [ebp-4],edx
00a111be  mov     edx,dword ptr [sum]
00a111c1  adc     ebx,0
00a111c4  add     esi,dword ptr [eax-1c20h]
00a111ca  adc     ecx,0
00a111cd  sub     dword ptr [ebp-8],1
00a111d1  jne     sum_array_cols+60h (0a111a0h)
    for(size_t col = 0; col < col_size; col++)
00a111d3  add     esi,dword ptr [ebp-4]
00a111d6  mov     eax,dword ptr [ebp-0ch]
00a111d9  adc     ecx,ebx
00a111db  add     edi,esi
00a111dd  adc     edx,ecx
00a111df  add     eax,4
00a111e2  sub     dword ptr [ebp-10h],1
00a111e6  mov     dword ptr [ebp-0ch],eax
00a111e9  jne     sum_array_cols+30h (0a11170h)
    return sum;
00a111eb  mov     eax,edi
}
00a111ed  pop     edi
00a111ee  pop     esi
00a111ef  pop     ebx
00a111f0  mov     esp,ebp
00a111f2  pop     ebp
00a111f3  ret
The short answer is that the optimizer is trying to be clever, but its heuristics are failing in this case of disparate type sizes, and it ends up slowing down the code.
Let us begin by reviewing the code generated for the inner loop of sum_array_rows with 64-bit source data:
innerloop:
    movups  xmm0,[eax-0x10]
    paddq   xmm2,xmm0
    movups  xmm0,[eax]
    add     eax,0x20
    paddq   xmm1,xmm0
    sub     ecx,1
    jne     innerloop
This is equivalent to the following C code using intrinsics:
do {
    sum1 = _mm_add_epi64(sum1, _mm_loadu_si128(&ptr[0]));
    sum2 = _mm_add_epi64(sum2, _mm_loadu_si128(&ptr[1]));
    ptr += 2;
} while(--count);
What we see here is that the optimizer has determined that the addition is associative, and has consequently unrolled the loop onto four parallel accumulators which are summed together at the end. This permits independent computations to be executed in parallel by the CPU and, more importantly, permits vectorization using the SSE2 instruction set to add pairs of 64-bit integers with a single instruction.
This is a good result.
On the other side of the spectrum we have the version with 64-bit accumulation of 32-bit source data:
innerloop:
    add     eax,[ecx]
    adc     edx,0
    mov     [sum],edx
    mov     edx,[ebp-4]
    add     edx,[ecx+4]
    mov     [ebp-4],edx
    mov     edx,[sum]
    adc     ebx,0
    add     edi,[ecx+8]
    adc     esi,0
    add     ecx,12
    sub     [ebp-8],1
    jne     innerloop
Note that the 32-bit x86 target lacks native support for 64-bit arithmetic using the normal (non-vector) instruction set. Consequently each accumulator is split into individual upper- and lower-word variables, with the carries out of the lower word manually propagated into the upper word.
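To illustrate the technique, here is a scalar sketch of such split-word emulation (my illustration of the general pattern, not MSVC's exact output):

#include <cstdint>
#include <cstddef>

uint64_t sum_emulated(const uint32_t *src, size_t n)
{
    uint32_t sum_lo = 0, sum_hi = 0;   // one 64-bit accumulator split in two
    for(size_t i = 0; i < n; i++) {
        uint32_t old_lo = sum_lo;
        sum_lo += src[i];              // add into the lower word
        if(sum_lo < old_lo)            // unsigned wraparound signals a carry...
            sum_hi++;                  // ...which the adc instructions above propagate
    }
    return (static_cast<uint64_t>(sum_hi) << 32) | sum_lo;
}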
Furthermore the loop is unrolled three times instead of four.
The pseudo-C version reads like this:
do {
    sum1_lo += ptr[0];
    if(carry) ++sum1_hi;
    sum2_lo += ptr[1];
    if(carry) ++sum2_hi;
    sum3_lo += ptr[2];
    if(carry) ++sum3_hi;
    ptr += 3;
} while(--count);
Regrettably the unrolling is a pessimization here, since the processor has run out of registers and is forced to shunt sum1_lo/sum2_lo and count into memory. An unrolling factor of two would have been the correct choice, and even no unrolling at all would have been faster.
A vectorized version using parallel addition on native 64-bit integers would still have been possible. It just requires the source data to be unpacked first, along these lines:
__m128i zero = _mm_setzero_si128();
do {
    __m128i data = _mm_loadu_si128(ptr++);
    sum1 = _mm_add_epi64(sum1, _mm_unpacklo_epi32(data, zero));
    sum2 = _mm_add_epi64(sum2, _mm_unpackhi_epi32(data, zero));
} while(--count);
I have omitted the code for folding the intermediate results, which are needlessly computed within the outer loop rather than waiting until the end of the function. Or better yet, do a single combined loop over the raw col_size*row_size array.
So what is going wrong here? Well, modern optimizers are complex beasts full of heuristics, and lacking insight I can only speculate.
However, a simplified model is that they are structured as passes, starting from a high-level representation and gradually applying transformations towards a lower-level form that is hoped to turn out more efficient. Regrettably, when a relatively high-level transform such as unrolling takes place, the low-level register allocation may not yet have taken place, and the compiler is largely forced to guess at a suitable unrolling factor.
It has been my experience that emulated wide-integer arithmetic rarely receives much love, being shunted off to generic fallbacks and poorly integrated with the other transforms.
Furthermore, vectorization in particular is a difficult process, and is applied when the code falls into one of the patterns known to the compiler.
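To illustrate with this very case: the same-width reduction below fits a pattern the vectorizer evidently recognizes, while the widening variant fell outside the known patterns on this compiler and target (per the assembly listings above):

#include <cstdint>
#include <cstddef>

// same-width reduction: vectorized into paddq in the listings above
uint64_t sum_same_width(const uint64_t *a, size_t n)
{
    uint64_t sum = 0;
    for(size_t i = 0; i < n; i++)
        sum += a[i];
    return sum;
}

// widening reduction: the 32 -> 64 bit conversion produced the scalar
// add/adc sequence instead
uint64_t sum_widening(const uint32_t *a, size_t n)
{
    uint64_t sum = 0;
    for(size_t i = 0; i < n; i++)
        sum += a[i];
    return sum;
}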
In practice the consequence of this amounts to a last-straw effect, where minor alterations of otherwise efficient code may fall beyond the complexity horizon of the optimizer. Instead of going back and forth indefinitely between passes until an optimal result is reached, the compiler is forced to rely on guesswork for performance reasons and to cop out at some point, at which the house of cards may come tumbling down.
Therefore, if your project relies on code generation then perform due diligence by periodically reviewing the output and testing for regressions, and in the case of critical inner loops it is advisable to explicitly write something closer to the expected final result rather than relying on fallible transformations.