I'm converting a project to compile with gcc from clang and I've ran into a issue with a function that uses sse functions:
void dodgy_function( const short* lows, const short* highs, short* mins, short* maxs, int its ) { __m128i v00[2] = { _mm_setzero_si128(), _mm_setzero_si128() }; __m128i v10[2] = { _mm_setzero_si128(), _mm_setzero_si128() }; for (int i = 0; i < its; ++i) { reinterpret_cast<short*>(v00)[i] = lows[i]; reinterpret_cast<short*>(v10)[i] = highs[i]; } reinterpret_cast<short*>(v00)[its] = reinterpret_cast<short*>(v00)[its - 1]; reinterpret_cast<short*>(v10)[its] = reinterpret_cast<short*>(v10)[its - 1]; __m128i v01[2] = {_mm_setzero_si128(), _mm_setzero_si128()}; __m128i v11[2] = {_mm_setzero_si128(), _mm_setzero_si128()}; __m128i min[2]; __m128i max[2]; min[0] = _mm_min_epi16(_mm_max_epi16(v11[0], v01[0]), _mm_min_epi16(v10[0], v00[0])); max[0] = _mm_max_epi16(_mm_max_epi16(v11[0], v01[0]), _mm_max_epi16(v10[0], v00[0])); min[1] = _mm_min_epi16(_mm_min_epi16(v11[1], v01[1]), _mm_min_epi16(v10[1], v00[1])); max[1] = _mm_max_epi16(_mm_max_epi16(v11[1], v01[1]), _mm_max_epi16(v10[1], v00[1])); reinterpret_cast<__m128i*>(mins)[0] = _mm_min_epi16(reinterpret_cast<__m128i*>(mins)[0], min[0]); reinterpret_cast<__m128i*>(maxs)[0] = _mm_max_epi16(reinterpret_cast<__m128i*>(maxs)[0], max[0]); reinterpret_cast<__m128i*>(mins)[1] = _mm_min_epi16(reinterpret_cast<__m128i*>(mins)[1], min[1]); reinterpret_cast<__m128i*>(maxs)[1] = _mm_max_epi16(reinterpret_cast<__m128i*>(maxs)[1], max[1]); }
Now with clang it gives it gives me the expected output but in gcc it prints all zeros: godbolt link
Playing around I discovered that gcc gives me the right results when I compile with -O1 but goes wrong with -O2 and -O3, suggesting the optimiser is going awry. Is there something particularly wrong I'm doing that would cause this behavior?
As a workaround I can wrap things up in a union and gcc will then give me the right result, but that feels a little icky: godbolt link 2
Any ideas?
https://stackoverflow.com/questions/66606202/gcc-misoptimises-sse-function March 13, 2021 at 03:39AM
没有评论:
发表评论