/* * * Copyright (C) 2006-2011 Anders Brander , * * Anders Kvist and Klaus Post * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ #include "dcp.h" #ifdef __AVX__ #include #include /* powf() */ #pragma GCC diagnostic ignored "-Wstrict-aliasing" /* We ignore this pragma, because we are casting a pointer from float to int to pass a float using */ /* _mm_insert_epi32, since no-one was kind enough to include "insertps xmm, mem32, imm8" */ /* as a valid intrinsic. So we use the integer equivalent instead */ /* Regarding table lookups: */ /* We are using double sized tables to avoid cache-splits, */ /* when looking up curve and rgb_tone */ static gfloat _ones_ps[4] __attribute__ ((aligned (16))) = {1.0f, 1.0f, 1.0f, 1.0f}; static gfloat _two_ps[4] __attribute__ ((aligned (16))) = {2.0f, 2.0f, 2.0f, 2.0f}; static gfloat _six_ps[4] __attribute__ ((aligned (16))) = {6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15, 6.0f-1e-15}; static gfloat _very_small_ps[4] __attribute__ ((aligned (16))) = {1e-15, 1e-15, 1e-15, 1e-15}; static const gfloat _two_to_23_ps[4] __attribute__ ((aligned (16))) = { 0x1.0p23f, 0x1.0p23f, 0x1.0p23f, 0x1.0p23f }; #define DW(A) _mm_castps_si128(A) #define PS(A) _mm_castsi128_ps(A) static inline void RGBtoHSV_SSE4(__m128 *c0, __m128 *c1, __m128 *c2) { __m128 zero_ps = _mm_setzero_ps(); __m128 small_ps = _mm_load_ps(_very_small_ps); __m128 ones_ps = _mm_load_ps(_ones_ps); // Any number > 1 __m128 add_v = _mm_load_ps(_two_ps); __m128 r = *c0; __m128 g = *c1; __m128 b = *c2; /* Clamp */ r = _mm_min_ps(_mm_max_ps(r, small_ps),ones_ps); g = _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps); b = _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps); __m128 h, v; v = _mm_max_ps(b,_mm_max_ps(r,g)); __m128 m = _mm_min_ps(b,_mm_min_ps(r,g)); __m128 gap = _mm_sub_ps(v,m); __m128 v_mask = _mm_cmpeq_ps(gap, zero_ps); v = _mm_add_ps(v, _mm_and_ps(add_v, v_mask)); h = _mm_setzero_ps(); /* Set gap to one where sat = 0, this will avoid divisions by zero, these values will not be used */ ones_ps = _mm_and_ps(ones_ps, v_mask); gap = _mm_or_ps(gap, ones_ps); /* gap_inv = 1.0 / gap */ __m128 gap_inv = _mm_rcp_ps(gap); /* if r == v */ /* h = (g - b) / gap; */ __m128 mask = _mm_cmpeq_ps(r, v); __m128 val = _mm_mul_ps(gap_inv, _mm_sub_ps(g, b)); /* fill h */ v = _mm_add_ps(v, _mm_and_ps(add_v, mask)); h = _mm_blendv_ps(h, val, mask); /* if g == v */ /* h = 2.0f + (b - r) / gap; */ __m128 two_ps = _mm_load_ps(_two_ps); mask = _mm_cmpeq_ps(g, v); val = _mm_sub_ps(b, r); val = _mm_mul_ps(val, gap_inv); val = _mm_add_ps(val, two_ps); v = _mm_add_ps(v, _mm_and_ps(add_v, mask)); h = _mm_blendv_ps(h, val, mask); /* If (b == v) */ /* h = 4.0f + (r - g) / gap; */ __m128 four_ps = _mm_add_ps(two_ps, two_ps); mask = _mm_cmpeq_ps(b, v); val = _mm_add_ps(four_ps, _mm_mul_ps(gap_inv, _mm_sub_ps(r, g))); v = _mm_add_ps(v, _mm_and_ps(add_v, mask)); h = _mm_blendv_ps(h, val, mask); __m128 s; /* Fill s, if gap > 0 */ v = _mm_sub_ps(v, add_v); val = _mm_mul_ps(gap,_mm_rcp_ps(v)); s = _mm_andnot_ps(v_mask, val ); /* Check if h < 0 */ zero_ps = _mm_setzero_ps(); __m128 six_ps = _mm_load_ps(_six_ps); mask = _mm_cmplt_ps(h, zero_ps); h = _mm_add_ps(h, _mm_and_ps(mask, six_ps)); *c0 = h; *c1 = s; *c2 = v; } static inline void HSVtoRGB_SSE4(__m128 *c0, __m128 *c1, __m128 *c2) { __m128 h = *c0; __m128 s = *c1; __m128 v = *c2; __m128 r, g, b; /* Convert get the fraction of h * h_fraction = h - (float)(int)h */ __m128 ones_ps = _mm_load_ps(_ones_ps); __m128 h_fraction = _mm_sub_ps(h, _mm_floor_ps(h)); /* p = v * (1.0f - s) */ __m128 p = _mm_mul_ps(v, _mm_sub_ps(ones_ps, s)); /* q = (v * (1.0f - s * f)) */ __m128 q = _mm_mul_ps(v, _mm_sub_ps(ones_ps, _mm_mul_ps(s, h_fraction))); /* t = (v * (1.0f - s * (1.0f - f))) */ __m128 t = _mm_mul_ps(v, _mm_sub_ps(ones_ps, _mm_mul_ps(s, _mm_sub_ps(ones_ps, h_fraction)))); /* h < 1 (case 0)*/ /* case 0: *r = v; *g = t; *b = p; break; */ __m128 h_threshold = _mm_add_ps(ones_ps, ones_ps); __m128 out_mask = _mm_cmplt_ps(h, ones_ps); r = _mm_and_ps(v, out_mask); g = _mm_and_ps(t, out_mask); b = _mm_and_ps(p, out_mask); /* h < 2 (case 1) */ /* case 1: *r = q; *g = v; *b = p; break; */ __m128 m = _mm_cmplt_ps(h, h_threshold); h_threshold = _mm_add_ps(h_threshold, ones_ps); m = _mm_andnot_ps(out_mask, m); r = _mm_blendv_ps(r, q, m); g = _mm_blendv_ps(g, v, m); b = _mm_blendv_ps(b, p, m); out_mask = _mm_or_ps(out_mask, m); /* h < 3 (case 2)*/ /* case 2: *r = p; *g = v; *b = t; break; */ m = _mm_cmplt_ps(h, h_threshold); h_threshold = _mm_add_ps(h_threshold, ones_ps); m = _mm_andnot_ps(out_mask, m); r = _mm_blendv_ps(r, p, m); g = _mm_blendv_ps(g, v, m); b = _mm_blendv_ps(b, t, m); out_mask = _mm_or_ps(out_mask, m); /* h < 4 (case 3)*/ /* case 3: *r = p; *g = q; *b = v; break; */ m = _mm_cmplt_ps(h, h_threshold); h_threshold = _mm_add_ps(h_threshold, ones_ps); m = _mm_andnot_ps(out_mask, m); r = _mm_blendv_ps(r, p, m); g = _mm_blendv_ps(g, q, m); b = _mm_blendv_ps(b, v, m); out_mask = _mm_or_ps(out_mask, m); /* h < 5 (case 4)*/ /* case 4: *r = t; *g = p; *b = v; break; */ m = _mm_cmplt_ps(h, h_threshold); m = _mm_andnot_ps(out_mask, m); r = _mm_or_ps(r, _mm_and_ps(t, m)); g = _mm_or_ps(g, _mm_and_ps(p, m)); b = _mm_or_ps(b, _mm_and_ps(v, m)); out_mask = _mm_or_ps(out_mask, m); /* Remainder (case 5) */ /* case 5: *r = v; *g = p; *b = q; break; */ __m128 all_ones = _mm_cmpeq_ps(h,h); m = _mm_xor_ps(out_mask, all_ones); r = _mm_blendv_ps(r, v, m); g = _mm_blendv_ps(g, p, m); b = _mm_blendv_ps(b, q, m); *c0 = r; *c1 = g; *c2 = b; } /* GCC 4.6.1 seems to miscompile this function with AVX, so disabled for now */ #if 0 static gint _ones_epi32[4] __attribute__ ((aligned (16))) = {1,1,1,1}; void huesat_map_AVX(RSHuesatMap *map, const PrecalcHSM* precalc, __m128 *_h, __m128 *_s, __m128 *_v) { __m128 zero_ps = _mm_setzero_ps(); __m128 ones_ps = _mm_load_ps(_ones_ps); __m128 h = *_h; __m128 s = *_s; __m128 v = *_v; /* Clamp - H must be pre-clamped*/ s = _mm_min_ps(_mm_max_ps(s, zero_ps),ones_ps); v = _mm_min_ps(_mm_max_ps(v, zero_ps),ones_ps); gint xfer_0[4] __attribute__ ((aligned (16))); gint xfer_1[4] __attribute__ ((aligned (16))); const gfloat *tableBase = precalc->lookups; __m128 hueShift; __m128 satScale; __m128 valScale; if (map->val_divisions < 2) { __m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale)); __m128 sScaled = _mm_mul_ps(s, _mm_load_ps(precalc->sScale)); __m128i hIndex0 = _mm_cvttps_epi32( hScaled ); __m128i sIndex0 = _mm_cvttps_epi32( sScaled ); __m128i ones_epi32 = _mm_load_si128((__m128i*)_ones_epi32); __m128i hIndex1 = _mm_add_epi32(hIndex0, ones_epi32); /* We must max here, since otherwise we might get -0 values */ __m128 hFract1 = _mm_max_ps(zero_ps, _mm_sub_ps( hScaled, _mm_cvtepi32_ps(hIndex0))); __m128 sFract1 = _mm_max_ps(zero_ps, _mm_sub_ps( sScaled, _mm_cvtepi32_ps(sIndex0))); __m128 ones_ps = _mm_load_ps(_ones_ps); __m128 hFract0 = _mm_sub_ps(ones_ps, hFract1); __m128 sFract0 = _mm_sub_ps(ones_ps, sFract1); __m128i hueStep = _mm_load_si128((__m128i*)precalc->hueStep); __m128i table_offsets = _mm_slli_epi32(_mm_add_epi32(sIndex0, _mm_mullo_epi16(hIndex0, hueStep)), 2); __m128i next_offsets = _mm_slli_epi32(_mm_add_epi32(sIndex0, _mm_mullo_epi16(hIndex1, hueStep)), 2); _mm_store_si128((__m128i*)xfer_0, table_offsets); _mm_store_si128((__m128i*)xfer_1, next_offsets); const gfloat *entry00[4] = { tableBase + xfer_0[0], tableBase + xfer_0[1], tableBase + xfer_0[2], tableBase + xfer_0[3]}; const gfloat *entry01[4] = { tableBase + xfer_1[0], tableBase + xfer_1[1], tableBase + xfer_1[2], tableBase + xfer_1[3]}; __m128 w0 = _mm_shuffle_ps(sFract0, sFract0, 0); __m128 w1 = _mm_shuffle_ps(sFract1, sFract1, 0); __m128 p00 = _mm_load_ps(entry00[0]); __m128 p10 = _mm_load_ps(entry00[0]+4); __m128 p01 = _mm_load_ps(entry01[0]); __m128 p11 = _mm_load_ps(entry01[0]+4); __m128 p0 = _mm_add_ps(_mm_mul_ps(p00, w0), _mm_mul_ps(p10, w1)); __m128 p1 = _mm_add_ps(_mm_mul_ps(p01, w0), _mm_mul_ps(p11, w1)); w0 = _mm_shuffle_ps(hFract0, hFract0, 0); w1 = _mm_shuffle_ps(hFract1, hFract1, 0); __m128 out_p0 = _mm_add_ps(_mm_mul_ps(p0, w0), _mm_mul_ps(p1, w1)); w0 = _mm_shuffle_ps(sFract0, sFract0, _MM_SHUFFLE(1,1,1,1)); w1 = _mm_shuffle_ps(sFract1, sFract1, _MM_SHUFFLE(1,1,1,1)); p00 = _mm_load_ps(entry00[1]); p10 = _mm_load_ps(entry00[1]+4); p01 = _mm_load_ps(entry01[1]); p11 = _mm_load_ps(entry01[1]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0), _mm_mul_ps(p10, w1)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0), _mm_mul_ps(p11, w1)); w0 = _mm_shuffle_ps(hFract0, hFract0, _MM_SHUFFLE(1,1,1,1)); w1 = _mm_shuffle_ps(hFract1, hFract1, _MM_SHUFFLE(1,1,1,1)); __m128 out_p1 = _mm_add_ps(_mm_mul_ps(p0, w0), _mm_mul_ps(p1, w1)); w0 = _mm_shuffle_ps(sFract0, sFract0, _MM_SHUFFLE(2,2,2,2)); w1 = _mm_shuffle_ps(sFract1, sFract1, _MM_SHUFFLE(2,2,2,2)); p00 = _mm_load_ps(entry00[2]); p10 = _mm_load_ps(entry00[2]+4); p01 = _mm_load_ps(entry01[2]); p11 = _mm_load_ps(entry01[2]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0), _mm_mul_ps(p10, w1)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0), _mm_mul_ps(p11, w1)); w0 = _mm_shuffle_ps(hFract0, hFract0, _MM_SHUFFLE(2,2,2,2)); w1 = _mm_shuffle_ps(hFract1, hFract1, _MM_SHUFFLE(2,2,2,2)); __m128 out_p2 = _mm_add_ps(_mm_mul_ps(p0, w0), _mm_mul_ps(p1, w1)); w0 = _mm_shuffle_ps(sFract0, sFract0, _MM_SHUFFLE(3,3,3,3)); w1 = _mm_shuffle_ps(sFract1, sFract1, _MM_SHUFFLE(3,3,3,3)); p00 = _mm_load_ps(entry00[3]); p10 = _mm_load_ps(entry00[3]+4); p01 = _mm_load_ps(entry01[3]); p11 = _mm_load_ps(entry01[3]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0), _mm_mul_ps(p10, w1)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0), _mm_mul_ps(p11, w1)); w0 = _mm_shuffle_ps(hFract0, hFract0, _MM_SHUFFLE(3,3,3,3)); w1 = _mm_shuffle_ps(hFract1, hFract1, _MM_SHUFFLE(3,3,3,3)); __m128 out_p3 = _mm_add_ps(_mm_mul_ps(p0, w0), _mm_mul_ps(p1, w1)); hueShift = _mm_shuffle_ps(out_p0, out_p2, _MM_SHUFFLE(3,0,3,0)); hueShift = _mm_or_ps(hueShift, _mm_shuffle_ps(out_p1, out_p3, _MM_SHUFFLE(0,3,0,3))); satScale = _mm_shuffle_ps(out_p0, out_p2, _MM_SHUFFLE(3,1,3,1)); satScale = _mm_or_ps(satScale, _mm_shuffle_ps(out_p1, out_p3, _MM_SHUFFLE(1,3,1,3))); valScale = _mm_shuffle_ps(out_p0, out_p2, _MM_SHUFFLE(3,2,3,2)); valScale = _mm_or_ps(valScale, _mm_shuffle_ps(out_p1, out_p3, _MM_SHUFFLE(2,3,2,3))); } else { __m128 hScaled = _mm_mul_ps(h, _mm_load_ps(precalc->hScale)); __m128 sScaled = _mm_mul_ps(s, _mm_load_ps(precalc->sScale)); __m128 vScaled = _mm_mul_ps(v, _mm_load_ps(precalc->vScale)); __m128i hIndex0 = _mm_cvttps_epi32(hScaled); __m128i sIndex0 = _mm_cvttps_epi32(sScaled); __m128i vIndex0 = _mm_cvttps_epi32(vScaled); __m128i ones_epi32 = _mm_load_si128((__m128i*)_ones_epi32); __m128i hIndex1 = _mm_add_epi32(hIndex0, ones_epi32); __m128 hFract1 = _mm_max_ps(zero_ps, _mm_sub_ps( hScaled, _mm_cvtepi32_ps(hIndex0))); __m128 sFract1 = _mm_max_ps(zero_ps, _mm_sub_ps( sScaled, _mm_cvtepi32_ps(sIndex0))); __m128 vFract1 = _mm_max_ps(zero_ps, _mm_sub_ps( vScaled, _mm_cvtepi32_ps(vIndex0))); __m128 ones_ps = _mm_load_ps(_ones_ps); __m128 hFract0 = _mm_sub_ps(ones_ps, hFract1); __m128 sFract0 = _mm_sub_ps(ones_ps, sFract1); __m128 vFract0 = _mm_sub_ps(ones_ps, vFract1); __m128i hueStep = _mm_load_si128((__m128i*)precalc->hueStep); __m128i valStep = _mm_load_si128((__m128i*)precalc->valStep); // This requires that hueStep and valStep can be contained in a 16 bit signed integer. __m128i table_offsets = _mm_add_epi32(sIndex0, _mm_mullo_epi16(vIndex0, valStep)); __m128i next_offsets = _mm_mullo_epi16(hIndex1, hueStep); next_offsets = _mm_slli_epi32(_mm_add_epi32(next_offsets, table_offsets), 2); table_offsets = _mm_slli_epi32(_mm_add_epi32(table_offsets, _mm_mullo_epi16(hIndex0, hueStep)), 2); _mm_store_si128((__m128i*)xfer_0, table_offsets); _mm_store_si128((__m128i*)xfer_1, next_offsets); guint _valStep = precalc->valStep[0] * 4; const gfloat *entry00[4] = { tableBase + xfer_0[0], tableBase + xfer_0[1], tableBase + xfer_0[2], tableBase + xfer_0[3]}; const gfloat *entry01[4] = { tableBase + xfer_1[0], tableBase + xfer_1[1], tableBase + xfer_1[2], tableBase + xfer_1[3]}; const gfloat *entry10[4] = { entry00[0] + _valStep, entry00[1] + _valStep, entry00[2] + _valStep, entry00[3] + _valStep}; const gfloat *entry11[4] = { entry01[0] + _valStep, entry01[1] + _valStep, entry01[2] + _valStep, entry01[3] + _valStep}; __m128 w0s = _mm_shuffle_ps(sFract0, sFract0, 0); __m128 w1s = _mm_shuffle_ps(sFract1, sFract1, 0); __m128 p00 = _mm_load_ps(entry00[0]); __m128 p10 = _mm_load_ps(entry00[0]+4); __m128 p01 = _mm_load_ps(entry01[0]); __m128 p11 = _mm_load_ps(entry01[0]+4); __m128 p0 = _mm_add_ps(_mm_mul_ps(p00, w0s), _mm_mul_ps(p10, w1s)); __m128 p1 = _mm_add_ps(_mm_mul_ps(p01, w0s), _mm_mul_ps(p11, w1s)); __m128 w0h = _mm_shuffle_ps(hFract0, hFract0, 0); __m128 w1h = _mm_shuffle_ps(hFract1, hFract1, 0); __m128 p0u = _mm_add_ps(_mm_mul_ps(p0, w0h), _mm_mul_ps(p1, w1h)); p00 = _mm_load_ps(entry10[0]); p10 = _mm_load_ps(entry10[0]+4); p01 = _mm_load_ps(entry11[0]); p11 = _mm_load_ps(entry11[0]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0s), _mm_mul_ps(p10, w1s)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0s), _mm_mul_ps(p11, w1s)); __m128 p0d = _mm_add_ps(_mm_mul_ps(p0, w0h), _mm_mul_ps(p1, w1h)); __m128 w0v = _mm_shuffle_ps(vFract0, vFract0, 0); __m128 w1v = _mm_shuffle_ps(vFract1, vFract1, 0); __m128 out_p0 = _mm_add_ps(_mm_mul_ps(p0u, w0v), _mm_mul_ps(p0d, w1v)); // Pixel 2 w0s = _mm_shuffle_ps(sFract0, sFract0, _MM_SHUFFLE(1,1,1,1)); w1s = _mm_shuffle_ps(sFract1, sFract1, _MM_SHUFFLE(1,1,1,1)); p00 = _mm_load_ps(entry00[1]); p10 = _mm_load_ps(entry00[1]+4); p01 = _mm_load_ps(entry01[1]); p11 = _mm_load_ps(entry01[1]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0s), _mm_mul_ps(p10, w1s)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0s), _mm_mul_ps(p11, w1s)); w0h = _mm_shuffle_ps(hFract0, hFract0, _MM_SHUFFLE(1,1,1,1)); w1h = _mm_shuffle_ps(hFract1, hFract1, _MM_SHUFFLE(1,1,1,1)); p0u = _mm_add_ps(_mm_mul_ps(p0, w0h), _mm_mul_ps(p1, w1h)); p00 = _mm_load_ps(entry10[1]); p10 = _mm_load_ps(entry10[1]+4); p01 = _mm_load_ps(entry11[1]); p11 = _mm_load_ps(entry11[1]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0s), _mm_mul_ps(p10, w1s)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0s), _mm_mul_ps(p11, w1s)); p0d = _mm_add_ps(_mm_mul_ps(p0, w0h), _mm_mul_ps(p1, w1h)); w0v = _mm_shuffle_ps(vFract0, vFract0, _MM_SHUFFLE(1,1,1,1)); w1v = _mm_shuffle_ps(vFract1, vFract1, _MM_SHUFFLE(1,1,1,1)); __m128 out_p1 = _mm_add_ps(_mm_mul_ps(p0u, w0v), _mm_mul_ps(p0d, w1v)); // Pixel 3 w0s = _mm_shuffle_ps(sFract0, sFract0, _MM_SHUFFLE(2,2,2,2)); w1s = _mm_shuffle_ps(sFract1, sFract1, _MM_SHUFFLE(2,2,2,2)); p00 = _mm_load_ps(entry00[2]); p10 = _mm_load_ps(entry00[2]+4); p01 = _mm_load_ps(entry01[2]); p11 = _mm_load_ps(entry01[2]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0s), _mm_mul_ps(p10, w1s)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0s), _mm_mul_ps(p11, w1s)); w0h = _mm_shuffle_ps(hFract0, hFract0, _MM_SHUFFLE(2,2,2,2)); w1h = _mm_shuffle_ps(hFract1, hFract1, _MM_SHUFFLE(2,2,2,2)); p0u = _mm_add_ps(_mm_mul_ps(p0, w0h), _mm_mul_ps(p1, w1h)); p00 = _mm_load_ps(entry10[2]); p10 = _mm_load_ps(entry10[2]+4); p01 = _mm_load_ps(entry11[2]); p11 = _mm_load_ps(entry11[2]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0s), _mm_mul_ps(p10, w1s)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0s), _mm_mul_ps(p11, w1s)); p0d = _mm_add_ps(_mm_mul_ps(p0, w0h), _mm_mul_ps(p1, w1h)); w0v = _mm_shuffle_ps(vFract0, vFract0, _MM_SHUFFLE(2,2,2,2)); w1v = _mm_shuffle_ps(vFract1, vFract1, _MM_SHUFFLE(2,2,2,2)); __m128 out_p2 = _mm_add_ps(_mm_mul_ps(p0u, w0v), _mm_mul_ps(p0d, w1v)); // Pixel 4 w0s = _mm_shuffle_ps(sFract0, sFract0, _MM_SHUFFLE(3,3,3,3)); w1s = _mm_shuffle_ps(sFract1, sFract1, _MM_SHUFFLE(3,3,3,3)); p00 = _mm_load_ps(entry00[3]); p10 = _mm_load_ps(entry00[3]+4); p01 = _mm_load_ps(entry01[3]); p11 = _mm_load_ps(entry01[3]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0s), _mm_mul_ps(p10, w1s)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0s), _mm_mul_ps(p11, w1s)); w0h = _mm_shuffle_ps(hFract0, hFract0, _MM_SHUFFLE(3,3,3,3)); w1h = _mm_shuffle_ps(hFract1, hFract1, _MM_SHUFFLE(3,3,3,3)); p0u = _mm_add_ps(_mm_mul_ps(p0, w0h), _mm_mul_ps(p1, w1h)); p00 = _mm_load_ps(entry10[3]); p10 = _mm_load_ps(entry10[3]+4); p01 = _mm_load_ps(entry11[3]); p11 = _mm_load_ps(entry11[3]+4); p0 = _mm_add_ps(_mm_mul_ps(p00, w0s), _mm_mul_ps(p10, w1s)); p1 = _mm_add_ps(_mm_mul_ps(p01, w0s), _mm_mul_ps(p11, w1s)); p0d = _mm_add_ps(_mm_mul_ps(p0, w0h), _mm_mul_ps(p1, w1h)); w0v = _mm_shuffle_ps(vFract0, vFract0, _MM_SHUFFLE(3,3,3,3)); w1v = _mm_shuffle_ps(vFract1, vFract1, _MM_SHUFFLE(3,3,3,3)); __m128 out_p3 = _mm_add_ps(_mm_mul_ps(p0u, w0v), _mm_mul_ps(p0d, w1v)); hueShift = _mm_shuffle_ps(out_p0, out_p2, _MM_SHUFFLE(3,0,3,0)); hueShift = _mm_or_ps(hueShift, _mm_shuffle_ps(out_p1, out_p3, _MM_SHUFFLE(0,3,0,3))); satScale = _mm_shuffle_ps(out_p0, out_p2, _MM_SHUFFLE(3,1,3,1)); satScale = _mm_or_ps(satScale, _mm_shuffle_ps(out_p1, out_p3, _MM_SHUFFLE(1,3,1,3))); valScale = _mm_shuffle_ps(out_p0, out_p2, _MM_SHUFFLE(3,2,3,2)); valScale = _mm_or_ps(valScale, _mm_shuffle_ps(out_p1, out_p3, _MM_SHUFFLE(2,3,2,3))); } ones_ps = _mm_load_ps(_ones_ps); s = _mm_min_ps(ones_ps, _mm_mul_ps(s, satScale)); v = _mm_min_ps(ones_ps, _mm_mul_ps(v, valScale)); h = _mm_add_ps(h, hueShift); *_h = h; *_s = s; *_v = v; } #endif static gfloat _16_bit_ps[4] __attribute__ ((aligned (16))) = {65535.0, 65535.0, 65535.0, 65535.0}; static gfloat _thousand_24_ps[4] __attribute__ ((aligned (16))) = {1023.99999f, 1023.99999f, 1023.99999f, 1023.99999f}; static inline __m128 curve_interpolate_lookup(__m128 value, const gfloat * const tone_lut) { int xfer[8] __attribute__ ((aligned (16))); /* Convert v to lookup values and interpolate */ __m128 mul = _mm_mul_ps(value, _mm_load_ps(_thousand_24_ps)); __m128i lookup = _mm_cvtps_epi32(mul); _mm_store_si128((__m128i*)&xfer[0], lookup); /* Calculate fractions */ __m128 frac = _mm_sub_ps(mul, _mm_floor_ps(mul)); __m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac); /* Load two adjacent curve values and interpolate between them */ __m128 p0p1 = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[xfer[0]*2])); __m128 p2p3 = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&tone_lut[xfer[2]*2])); p0p1 = _mm_loadh_pi(p0p1, (__m64*)&tone_lut[xfer[1]*2]); p2p3 = _mm_loadh_pi(p2p3, (__m64*)&tone_lut[xfer[3]*2]); /* Pack all lower values in v0, high in v1 and interpolate */ __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(2,0,2,0)); __m128 v1 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(3,1,3,1)); return _mm_add_ps(_mm_mul_ps(inv_frac, v0), _mm_mul_ps(frac, v1)); } static inline void rgb_tone_sse2(__m128* _r, __m128* _g, __m128* _b, const gfloat * const tone_lut) { __m128 r = *_r; __m128 g = *_g; __m128 b = *_b; __m128 small_ps = _mm_load_ps(_very_small_ps); __m128 ones_ps = _mm_load_ps(_ones_ps); /* Clamp to avoid lookups out of table */ r = _mm_min_ps(_mm_max_ps(r, small_ps),ones_ps); g = _mm_min_ps(_mm_max_ps(g, small_ps),ones_ps); b = _mm_min_ps(_mm_max_ps(b, small_ps),ones_ps); /* Find largest and smallest values */ __m128 lg = _mm_max_ps(b, _mm_max_ps(r, g)); __m128 sm = _mm_min_ps(b, _mm_min_ps(r, g)); /* Lookup */ __m128 LG = curve_interpolate_lookup(lg, tone_lut); __m128 SM = curve_interpolate_lookup(sm, tone_lut); /* Create masks for largest, smallest and medium values */ /* This is done in integer AVX, since they have double the throughput */ __m128i ones = _mm_cmpeq_epi32(DW(r), DW(r)); __m128i is_r_lg = _mm_cmpeq_epi32(DW(r), DW(lg)); __m128i is_g_lg = _mm_cmpeq_epi32(DW(g), DW(lg)); __m128i is_b_lg = _mm_cmpeq_epi32(DW(b), DW(lg)); __m128i is_r_sm = _mm_andnot_si128(is_r_lg, _mm_cmpeq_epi32(DW(r), DW(sm))); __m128i is_g_sm = _mm_andnot_si128(is_g_lg, _mm_cmpeq_epi32(DW(g), DW(sm))); __m128i is_b_sm = _mm_andnot_si128(is_b_lg, _mm_cmpeq_epi32(DW(b), DW(sm))); __m128i is_r_md = _mm_xor_si128(ones, _mm_or_si128(is_r_lg, is_r_sm)); __m128i is_g_md = _mm_xor_si128(ones, _mm_or_si128(is_g_lg, is_g_sm)); __m128i is_b_md = _mm_xor_si128(ones, _mm_or_si128(is_b_lg, is_b_sm)); /* Find all medium values based on masks */ __m128 md = PS(_mm_or_si128(_mm_or_si128( _mm_and_si128(DW(r), is_r_md), _mm_and_si128(DW(g), is_g_md)), _mm_and_si128(DW(b), is_b_md))); /* Calculate tone corrected medium value */ __m128 p = _mm_rcp_ps(_mm_sub_ps(lg, sm)); __m128 q = _mm_sub_ps(md, sm); __m128 o = _mm_sub_ps(LG, SM); __m128 MD = _mm_add_ps(SM, _mm_mul_ps(o, _mm_mul_ps(p, q))); /* Inserted here again, to lighten register presssure */ is_r_lg = _mm_cmpeq_epi32(DW(r), DW(lg)); is_g_lg = _mm_cmpeq_epi32(DW(g), DW(lg)); is_b_lg = _mm_cmpeq_epi32(DW(b), DW(lg)); /* Combine corrected values to output RGB */ r = PS(_mm_or_si128( _mm_or_si128( _mm_and_si128(DW(LG), is_r_lg), _mm_and_si128(DW(SM), is_r_sm)), _mm_and_si128(DW(MD), is_r_md))); g = PS(_mm_or_si128( _mm_or_si128( _mm_and_si128(DW(LG), is_g_lg), _mm_and_si128(DW(SM), is_g_sm)), _mm_and_si128(DW(MD), is_g_md))); b = PS(_mm_or_si128( _mm_or_si128( _mm_and_si128(DW(LG), is_b_lg), _mm_and_si128(DW(SM), is_b_sm)), _mm_and_si128(DW(MD), is_b_md))); *_r = r; *_g = g; *_b = b; } #undef DW #undef PS static inline __m128 sse_matrix3_mul(float* mul, __m128 a, __m128 b, __m128 c) { __m128 v = _mm_load_ps(mul); __m128 acc = _mm_mul_ps(a, v); v = _mm_load_ps(mul+4); acc = _mm_add_ps(acc, _mm_mul_ps(b, v)); v = _mm_load_ps(mul+8); acc = _mm_add_ps(acc, _mm_mul_ps(c, v)); return acc; } static gfloat _rgb_div_ps[4] __attribute__ ((aligned (16))) = {1.0/65535.0, 1.0/65535.0, 1.0/65535.0, 1.0/65535.0}; static gint _15_bit_epi32[4] __attribute__ ((aligned (16))) = { 32768, 32768, 32768, 32768}; static guint _16_bit_sign[4] __attribute__ ((aligned (16))) = {0x80008000,0x80008000,0x80008000,0x80008000}; static gfloat _twofiftysix_ps[4] __attribute__ ((aligned (16))) = {255.9999f,255.9999f,255.9999f,255.9999f}; #define SETFLOAT4(N, A, B, C, D) float N[4] __attribute__ ((aligned (16))); \ N[0] = D; N[1] = C; N[2] = B; N[3] = A; #define SETFLOAT4_SAME(N, A) float N[4] __attribute__ ((aligned (16))); \ N[0] = A; N[1] = A; N[2] = A; N[3] = A; extern void huesat_map_SSE2(RSHuesatMap *map, const PrecalcHSM* precalc, __m128 *_h, __m128 *_s, __m128 *_v); gboolean render_AVX(ThreadInfo* t) { RS_IMAGE16 *image = t->tmp; RSDcp *dcp = t->dcp; gint x, y; __m128 h, s, v; __m128i p1,p2; __m128 p1f, p2f, p3f, p4f; __m128 r, g, b, r2, g2, b2; __m128i zero; int _mm_rounding = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN); __m128 hue_add = _mm_set_ps(dcp->hue, dcp->hue, dcp->hue, dcp->hue); __m128 sat; if (dcp->saturation > 1.0) sat = _mm_set_ps(dcp->saturation-1.0f, dcp->saturation-1.0f, dcp->saturation-1.0f, dcp->saturation-1.0f); else sat = _mm_set_ps(dcp->saturation, dcp->saturation, dcp->saturation, dcp->saturation); gboolean do_contrast = (dcp->contrast > 1.001f); gboolean do_highrec = (dcp->contrast < 0.999f); float exposure_simple = MAX(1.0, powf(2.0f, dcp->exposure)); float __recover_radius = 0.5 * exposure_simple; SETFLOAT4_SAME(_inv_recover_radius, 1.0f / __recover_radius); SETFLOAT4_SAME(_recover_radius, 1.0 - __recover_radius); int xfer[4] __attribute__ ((aligned (16))); SETFLOAT4_SAME(_min_cam, 1.0); SETFLOAT4_SAME(_black_minus_radius, dcp->exposure_black - dcp->exposure_radius); SETFLOAT4_SAME(_black_plus_radius, dcp->exposure_black + dcp->exposure_radius); SETFLOAT4_SAME(_exposure_black, dcp->exposure_black); SETFLOAT4_SAME(_exposure_slope, dcp->exposure_slope); SETFLOAT4_SAME(_exposure_qscale, dcp->exposure_qscale); SETFLOAT4_SAME(_contrast, dcp->contrast); SETFLOAT4_SAME(_inv_contrast, 1.0f - dcp->contrast); SETFLOAT4_SAME(_contr_base, 0.5f); if (dcp->use_profile) { _min_cam[0] = dcp->camera_white.x; _min_cam[1] = dcp->camera_white.y; _min_cam[2] = dcp->camera_white.z; _min_cam[3] = 0.0; } float cam_prof[4*4*3] __attribute__ ((aligned (16))); for (x = 0; x < 4; x++ ) { cam_prof[x] = dcp->camera_to_prophoto.coeff[0][0] * dcp->channelmixer_red; cam_prof[x+4] = dcp->camera_to_prophoto.coeff[0][1] * dcp->channelmixer_red; cam_prof[x+8] = dcp->camera_to_prophoto.coeff[0][2] * dcp->channelmixer_red; cam_prof[12+x] = dcp->camera_to_prophoto.coeff[1][0] * dcp->channelmixer_green; cam_prof[12+x+4] = dcp->camera_to_prophoto.coeff[1][1] * dcp->channelmixer_green; cam_prof[12+x+8] = dcp->camera_to_prophoto.coeff[1][2] * dcp->channelmixer_green; cam_prof[24+x] = dcp->camera_to_prophoto.coeff[2][0] * dcp->channelmixer_blue; cam_prof[24+x+4] = dcp->camera_to_prophoto.coeff[2][1] * dcp->channelmixer_blue; cam_prof[24+x+8] = dcp->camera_to_prophoto.coeff[2][2]* dcp->channelmixer_blue; } gint end_x = image->w - (image->w & 3); for(y = t->start_y ; y < t->end_y; y++) { __m128i* pixel = (__m128i*)GET_PIXEL(image, 0, y); /* Prefetch next line */ _mm_prefetch((char*)(pixel)+image->rowstride*2, _MM_HINT_NTA); for(x=0; x < end_x; x+=4) { zero = _mm_setzero_si128(); /* Convert to float */ p1 = _mm_load_si128(pixel); p2 = _mm_load_si128(pixel + 1); _mm_prefetch((char*)(pixel+4), _MM_HINT_NTA); /* Unpack to R G B x */ p2f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(p1, zero)); p4f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(p2, zero)); p1f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(p1, zero)); p3f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(p2, zero)); /* Normalize to 0 to 1 range */ __m128 rgb_div = _mm_load_ps(_rgb_div_ps); p1f = _mm_mul_ps(p1f, rgb_div); p2f = _mm_mul_ps(p2f, rgb_div); p3f = _mm_mul_ps(p3f, rgb_div); p4f = _mm_mul_ps(p4f, rgb_div); if (dcp->use_profile) { /* Restric to camera white */ __m128 min_cam = _mm_load_ps(_min_cam); p1f = _mm_min_ps(p1f, min_cam); p2f = _mm_min_ps(p2f, min_cam); p3f = _mm_min_ps(p3f, min_cam); p4f = _mm_min_ps(p4f, min_cam); } /* Convert to planar */ __m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f); __m128 b1b0 = _mm_unpackhi_ps(p1f, p2f); __m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f); __m128 b3b2 = _mm_unpackhi_ps(p3f, p4f); r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2); g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0); b = _mm_movelh_ps(b1b0, b3b2); /* Convert to Prophoto */ r2 = sse_matrix3_mul(cam_prof, r, g, b); g2 = sse_matrix3_mul(&cam_prof[12], r, g, b); b2 = sse_matrix3_mul(&cam_prof[24], r, g, b); RGBtoHSV_SSE4(&r2, &g2, &b2); h = r2; s = g2; v = b2; if (dcp->huesatmap) { huesat_map_SSE2(dcp->huesatmap, dcp->huesatmap_precalc, &h, &s, &v); } /* Saturation */ __m128 max_val = _mm_load_ps(_ones_ps); __m128 min_val = _mm_load_ps(_very_small_ps); if (dcp->saturation > 1.0) { __m128 two_ps = _mm_load_ps(_two_ps); __m128 ones_ps = _mm_load_ps(_ones_ps); /* out = (sat) * (x*2-x^2.0) + ((1.0-sat)*x) */ __m128 s_curved = _mm_mul_ps(sat, _mm_sub_ps(_mm_mul_ps(s, two_ps), _mm_mul_ps(s,s))); s = _mm_min_ps(max_val, _mm_add_ps(s_curved, _mm_mul_ps(s, _mm_sub_ps(ones_ps, sat)))); } else { s = _mm_max_ps(min_val, _mm_min_ps(max_val, _mm_mul_ps(s, sat))); } /* Hue */ __m128 six_ps = _mm_load_ps(_six_ps); __m128 zero_ps = _mm_setzero_ps(); h = _mm_add_ps(h, hue_add); /* Check if hue >= 6 or < 0*/ __m128 h_mask_gt = _mm_cmpge_ps(h, six_ps); __m128 h_mask_lt = _mm_cmplt_ps(h, zero_ps); __m128 six_masked_gt = _mm_and_ps(six_ps, h_mask_gt); __m128 six_masked_lt = _mm_and_ps(six_ps, h_mask_lt); h = _mm_sub_ps(h, six_masked_gt); h = _mm_add_ps(h, six_masked_lt); __m128 v_stored = v; HSVtoRGB_SSE4(&h, &s, &v); r = h; g = s; b = v; /* Exposure */ /* y = x - (dcp->exposure_black - dcp->exposure_radius); */ /* x = dcp->exposure_qscale * y * y; */ __m128 black_minus_radius = _mm_load_ps(_black_minus_radius); __m128 y_r = _mm_sub_ps(r, black_minus_radius); __m128 y_g = _mm_sub_ps(g, black_minus_radius); __m128 y_b = _mm_sub_ps(b, black_minus_radius); __m128 exposure_qscale = _mm_load_ps(_exposure_qscale); y_r = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_r, y_r)); y_g = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_g, y_g)); y_b = _mm_mul_ps(exposure_qscale,_mm_mul_ps(y_b, y_b)); /* if (x >= dcp->exposure_black + dcp->exposure_radius) */ /* x = (x - dcp->exposure_black) * dcp->exposure_slope; */ __m128 exposure_slope = _mm_load_ps(_exposure_slope); __m128 exposure_black = _mm_load_ps(_exposure_black); __m128 y2_r = _mm_mul_ps(exposure_slope, _mm_sub_ps(r, exposure_black)); __m128 y2_g = _mm_mul_ps(exposure_slope, _mm_sub_ps(g, exposure_black)); __m128 y2_b = _mm_mul_ps(exposure_slope, _mm_sub_ps(b, exposure_black)); __m128 black_plus_radius = _mm_load_ps(_black_plus_radius); __m128 r_mask = _mm_cmpgt_ps(r, black_plus_radius); __m128 g_mask = _mm_cmpgt_ps(g, black_plus_radius); __m128 b_mask = _mm_cmpgt_ps(b, black_plus_radius); y_r = _mm_andnot_ps(r_mask, y_r); y_g = _mm_andnot_ps(g_mask, y_g); y_b = _mm_andnot_ps(b_mask, y_b); y_r = _mm_or_ps(y_r, _mm_and_ps(r_mask, y2_r)); y_g = _mm_or_ps(y_g, _mm_and_ps(g_mask, y2_g)); y_b = _mm_or_ps(y_b, _mm_and_ps(b_mask, y2_b)); /* if (x <= dcp->exposure_black - dcp->exposure_radius) x = 0; */ black_minus_radius = _mm_load_ps(_black_minus_radius); r_mask = _mm_cmple_ps(r, black_minus_radius); g_mask = _mm_cmple_ps(g, black_minus_radius); b_mask = _mm_cmple_ps(b, black_minus_radius); r = _mm_andnot_ps(r_mask, y_r); g = _mm_andnot_ps(g_mask, y_g); b = _mm_andnot_ps(b_mask, y_b); /* Contrast in gamma 2.0 */ if (do_contrast) { __m128 contr_base = _mm_load_ps(_contr_base); __m128 contrast = _mm_load_ps(_contrast); min_val = _mm_load_ps(_very_small_ps); r = _mm_max_ps(r, min_val); g = _mm_max_ps(g, min_val); b = _mm_max_ps(b, min_val); r = _mm_add_ps(_mm_mul_ps(contrast, _mm_sub_ps(_mm_rcp_ps(_mm_rsqrt_ps(r)), contr_base)), contr_base); g = _mm_add_ps(_mm_mul_ps(contrast, _mm_sub_ps(_mm_rcp_ps(_mm_rsqrt_ps(g)), contr_base)), contr_base); b = _mm_add_ps(_mm_mul_ps(contrast, _mm_sub_ps(_mm_rcp_ps(_mm_rsqrt_ps(b)), contr_base)), contr_base); r = _mm_max_ps(r, min_val); g = _mm_max_ps(g, min_val); b = _mm_max_ps(b, min_val); r = _mm_mul_ps(r,r); g = _mm_mul_ps(g,g); b = _mm_mul_ps(b,b); } else if (do_highrec) { max_val = _mm_load_ps(_ones_ps); __m128 inv_contrast = _mm_load_ps(_inv_contrast); __m128 recover_radius = _mm_load_ps(_recover_radius); __m128 inv_recover_radius = _mm_load_ps(_inv_recover_radius); /* Distance from 1.0 - radius */ __m128 dist = _mm_sub_ps(v_stored, recover_radius); /* Scale so distance is normalized, clamp */ __m128 dist_scaled = _mm_min_ps(max_val, _mm_mul_ps(dist, inv_recover_radius)); __m128 mul_val = _mm_sub_ps(max_val, _mm_mul_ps(dist_scaled, inv_contrast)); r = _mm_mul_ps(r, mul_val); g = _mm_mul_ps(g, mul_val); b = _mm_mul_ps(b, mul_val); } /* Convert to HSV */ RGBtoHSV_SSE4(&r, &g, &b); h = r; s = g; v = b; if (!dcp->curve_is_flat) { /* Convert v to lookup values and interpolate */ __m128 v_mul = _mm_mul_ps(v, _mm_load_ps(_twofiftysix_ps)); __m128i lookup = _mm_cvtps_epi32(v_mul); _mm_store_si128((__m128i*)&xfer[0], lookup); /* Calculate fractions */ __m128 frac = _mm_sub_ps(v_mul, _mm_floor_ps(v_mul)); __m128 inv_frac = _mm_sub_ps(_mm_load_ps(_ones_ps), frac); /* Load two adjacent curve values and interpolate between them */ __m128 p0p1 = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[0]*2])); __m128 p2p3 = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)&dcp->curve_samples[xfer[2]*2])); p0p1 = _mm_loadh_pi(p0p1, (__m64*)&dcp->curve_samples[xfer[1]*2]); p2p3 = _mm_loadh_pi(p2p3, (__m64*)&dcp->curve_samples[xfer[3]*2]); /* Pack all lower values in v0, high in v1 and interpolate */ __m128 v0 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(2,0,2,0)); __m128 v1 = _mm_shuffle_ps(p0p1, p2p3, _MM_SHUFFLE(3,1,3,1)); v = _mm_add_ps(_mm_mul_ps(inv_frac, v0), _mm_mul_ps(frac, v1)); } /* Apply looktable */ if (dcp->looktable) { huesat_map_SSE2(dcp->looktable, dcp->looktable_precalc, &h, &s, &v); } /* Ensure that hue is within range */ zero_ps = _mm_setzero_ps(); h_mask_gt = _mm_cmpge_ps(h, six_ps); h_mask_lt = _mm_cmplt_ps(h, zero_ps); six_masked_gt = _mm_and_ps(six_ps, h_mask_gt); six_masked_lt = _mm_and_ps(six_ps, h_mask_lt); h = _mm_sub_ps(h, six_masked_gt); h = _mm_add_ps(h, six_masked_lt); /* s always slightly > 0 when converting to RGB */ s = _mm_max_ps(s, min_val); HSVtoRGB_SSE4(&h, &s, &v); r = h; g = s; b = v; /* Apply Tone Curve in RGB space*/ if (dcp->tone_curve_lut) { rgb_tone_sse2( &r, &g, &b, dcp->tone_curve_lut); } /* Convert to 16 bit */ __m128 rgb_mul = _mm_load_ps(_16_bit_ps); r = _mm_mul_ps(r, rgb_mul); g = _mm_mul_ps(g, rgb_mul); b = _mm_mul_ps(b, rgb_mul); __m128i r_i = _mm_cvtps_epi32(r); __m128i g_i = _mm_cvtps_epi32(g); __m128i b_i = _mm_cvtps_epi32(b); __m128i sub_32 = _mm_load_si128((__m128i*)_15_bit_epi32); __m128i signxor = _mm_load_si128((__m128i*)_16_bit_sign); /* Subtract 32768 to avoid saturation */ r_i = _mm_sub_epi32(r_i, sub_32); g_i = _mm_sub_epi32(g_i, sub_32); b_i = _mm_sub_epi32(b_i, sub_32); /* 32 bit signed -> 16 bit signed conversion, all in lower 64 bit */ r_i = _mm_packs_epi32(r_i, r_i); g_i = _mm_packs_epi32(g_i, g_i); b_i = _mm_packs_epi32(b_i, b_i); /* Interleave*/ __m128i rg_i = _mm_unpacklo_epi16(r_i, g_i); __m128i bb_i = _mm_unpacklo_epi16(b_i, b_i); p1 = _mm_unpacklo_epi32(rg_i, bb_i); p2 = _mm_unpackhi_epi32(rg_i, bb_i); /* Convert sign back */ p1 = _mm_xor_si128(p1, signxor); p2 = _mm_xor_si128(p2, signxor); /* Store processed pixel */ _mm_store_si128(pixel, p1); _mm_store_si128(pixel + 1, p2); pixel += 2; } } _MM_SET_ROUNDING_MODE(_mm_rounding); return TRUE; } #undef SETFLOAT4 #undef SETFLOAT4_SAME #else // if not __AVX__ gboolean render_AVX(ThreadInfo* t) { return FALSE; } #endif