From fc36f3c47893a55651d06c2b48555d6a6ea1a498 Mon Sep 17 00:00:00 2001 From: Marek Dvoroznak Date: Wed, 11 Mar 2026 20:46:59 +0100 Subject: [PATCH] Update stb to the latest version --- third_party/stb/stb_c_lexer.h | 127 +- third_party/stb/stb_divide.h | 21 +- third_party/stb/stb_ds.h | 63 +- third_party/stb/stb_dxt.h | 334 +- third_party/stb/stb_hexwave.h | 680 ++ third_party/stb/stb_image.h | 760 +- third_party/stb/stb_image_resize2.h | 10651 +++++++++++++++++++++++++ third_party/stb/stb_image_write.h | 80 +- third_party/stb/stb_rect_pack.h | 35 +- third_party/stb/stb_sprintf.h | 121 +- third_party/stb/stb_textedit.h | 83 +- third_party/stb/stb_tilemap_editor.h | 40 +- third_party/stb/stb_truetype.h | 638 +- third_party/stb/stb_vorbis.c | 121 +- 14 files changed, 12733 insertions(+), 1021 deletions(-) create mode 100644 third_party/stb/stb_hexwave.h create mode 100644 third_party/stb/stb_image_resize2.h diff --git a/third_party/stb/stb_c_lexer.h b/third_party/stb/stb_c_lexer.h index be34f90..fd42f1c 100644 --- a/third_party/stb/stb_c_lexer.h +++ b/third_party/stb/stb_c_lexer.h @@ -1,4 +1,4 @@ -// stb_c_lexer.h - v0.11 - public domain Sean Barrett 2013 +// stb_c_lexer.h - v0.12 - public domain Sean Barrett 2013 // lexer for making little C-like languages with recursive-descent parsers // // This file provides both the interface and the implementation. @@ -10,6 +10,7 @@ // suffixes on integer constants are not handled (you can override this). // // History: +// 0.12 fix compilation bug for NUL support; better support separate inclusion // 0.11 fix clang static analysis warning // 0.10 fix warnings // 0.09 hex floats, no-stdlib fixes @@ -37,17 +38,25 @@ // Contributors: // Arpad Goretity (bugfix) // Alan Hickman (hex floats) +// github:mundusnine (bugfix) // // LICENSE // // See end of file for license information. +#ifdef STB_C_LEXER_IMPLEMENTATION #ifndef STB_C_LEXER_DEFINITIONS // to change the default parsing rules, copy the following lines // into your C/C++ file *before* including this, and then replace -// the Y's with N's for the ones you don't want. +// the Y's with N's for the ones you don't want. This needs to be +// set to the same values for every place in your program where +// stb_c_lexer.h is included. // --BEGIN-- +#if defined(Y) || defined(N) +#error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined" +#endif + #define STB_C_LEX_C_DECIMAL_INTS Y // "0|[1-9][0-9]*" CLEX_intlit #define STB_C_LEX_C_HEX_INTS Y // "0x[0-9a-fA-F]+" CLEX_intlit #define STB_C_LEX_C_OCTAL_INTS Y // "[0-7]+" CLEX_intlit @@ -95,7 +104,7 @@ #define STB_C_LEXER_DEFINITIONS // This line prevents the header file from replacing your definitions // --END-- - +#endif #endif #ifndef INCLUDE_STB_C_LEXER_H @@ -165,14 +174,44 @@ extern void stb_c_lexer_get_location(const stb_lexer *lexer, const char *where, } #endif -#endif // INCLUDE_STB_C_LEXER_H +enum +{ + CLEX_eof = 256, + CLEX_parse_error, + CLEX_intlit , + CLEX_floatlit , + CLEX_id , + CLEX_dqstring , + CLEX_sqstring , + CLEX_charlit , + CLEX_eq , + CLEX_noteq , + CLEX_lesseq , + CLEX_greatereq , + CLEX_andand , + CLEX_oror , + CLEX_shl , + CLEX_shr , + CLEX_plusplus , + CLEX_minusminus , + CLEX_pluseq , + CLEX_minuseq , + CLEX_muleq , + CLEX_diveq , + CLEX_modeq , + CLEX_andeq , + CLEX_oreq , + CLEX_xoreq , + CLEX_arrow , + CLEX_eqarrow , + CLEX_shleq, CLEX_shreq, -#ifdef STB_C_LEXER_IMPLEMENTATION + CLEX_first_unused_token - #if defined(Y) || defined(N) - #error "Can only use stb_c_lexer in contexts where the preprocessor symbols 'Y' and 'N' are not defined" - #endif +}; +#endif // INCLUDE_STB_C_LEXER_H +#ifdef STB_C_LEXER_IMPLEMENTATION // Hacky definitions so we can easily #if on them #define Y(x) 1 @@ -194,14 +233,6 @@ typedef long stb__clex_int; #define STB__clex_parse_suffixes #endif -#if STB_C_LEX_C_DECIMAL_INTS(x) || STB_C_LEX_C_HEX_INTS(x) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) -#define STB__clex_define_int -#endif - -#if (STB_C_LEX_C_ARITHEQ(x) && STB_C_LEX_C_SHIFTS(x)) || STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) -#define STB__clex_define_shifts -#endif - #if STB_C_LEX_C99_HEX_FLOATS(x) #define STB__clex_hex_floats #endif @@ -231,66 +262,10 @@ typedef long stb__clex_int; #include #endif -// Now pick a definition of Y/N that's conducive to -// defining the enum of token names. -#if STB_C_LEX_DEFINE_ALL_TOKEN_NAMES(x) || defined(STB_C_LEXER_SELF_TEST) - #undef N - #define N(a) Y(a) -#else - #undef N - #define N(a) -#endif - -#undef Y -#define Y(a) a, - -enum -{ - CLEX_eof = 256, - CLEX_parse_error, - -#ifdef STB__clex_define_int - CLEX_intlit, -#endif - - STB_C_LEX_C_DECIMAL_FLOATS( CLEX_floatlit ) - STB_C_LEX_C_IDENTIFIERS( CLEX_id ) - STB_C_LEX_C_DQ_STRINGS( CLEX_dqstring ) - STB_C_LEX_C_SQ_STRINGS( CLEX_sqstring ) - STB_C_LEX_C_CHARS( CLEX_charlit ) - STB_C_LEX_C_COMPARISONS( CLEX_eq ) - STB_C_LEX_C_COMPARISONS( CLEX_noteq ) - STB_C_LEX_C_COMPARISONS( CLEX_lesseq ) - STB_C_LEX_C_COMPARISONS( CLEX_greatereq ) - STB_C_LEX_C_LOGICAL( CLEX_andand ) - STB_C_LEX_C_LOGICAL( CLEX_oror ) - STB_C_LEX_C_SHIFTS( CLEX_shl ) - STB_C_LEX_C_SHIFTS( CLEX_shr ) - STB_C_LEX_C_INCREMENTS( CLEX_plusplus ) - STB_C_LEX_C_INCREMENTS( CLEX_minusminus ) - STB_C_LEX_C_ARITHEQ( CLEX_pluseq ) - STB_C_LEX_C_ARITHEQ( CLEX_minuseq ) - STB_C_LEX_C_ARITHEQ( CLEX_muleq ) - STB_C_LEX_C_ARITHEQ( CLEX_diveq ) - STB_C_LEX_C_ARITHEQ( CLEX_modeq ) - STB_C_LEX_C_BITWISEEQ( CLEX_andeq ) - STB_C_LEX_C_BITWISEEQ( CLEX_oreq ) - STB_C_LEX_C_BITWISEEQ( CLEX_xoreq ) - STB_C_LEX_C_ARROW( CLEX_arrow ) - STB_C_LEX_EQUAL_ARROW( CLEX_eqarrow ) - -#ifdef STB__clex_define_shifts - CLEX_shleq, CLEX_shreq, -#endif - - CLEX_first_unused_token - -#undef Y -#define Y(a) a -}; - // Now for the rest of the file we'll use the basic definition where // where Y expands to its contents and N expands to nothing +#undef Y +#define Y(a) a #undef N #define N(a) @@ -588,7 +563,6 @@ int stb_c_lexer_get_token(stb_lexer *lexer) { int n = 0; lexer->string = lexer->string_storage; - lexer->string_len = n; do { if (n+1 >= lexer->string_storage_len) return stb__clex_token(lexer, CLEX_parse_error, p, p+n); @@ -602,13 +576,14 @@ int stb_c_lexer_get_token(stb_lexer *lexer) STB_C_LEX_DOLLAR_IDENTIFIER( || p[n] == '$' ) ); lexer->string[n] = 0; + lexer->string_len = n; return stb__clex_token(lexer, CLEX_id, p, p+n-1); } // check for EOF STB_C_LEX_0_IS_EOF( if (*p == 0) - return stb__clex_eof(tok); + return stb__clex_eof(lexer); ) single_char: diff --git a/third_party/stb/stb_divide.h b/third_party/stb/stb_divide.h index f8b1f3e..6a51e3f 100644 --- a/third_party/stb/stb_divide.h +++ b/third_party/stb/stb_divide.h @@ -1,8 +1,9 @@ -// stb_divide.h - v0.93 - public domain - Sean Barrett, Feb 2010 +// stb_divide.h - v0.94 - public domain - Sean Barrett, Feb 2010 // Three kinds of divide/modulus of signed integers. // // HISTORY // +// v0.94 Fix integer overflow issues // v0.93 2020-02-02 Write useful exit() value from main() // v0.92 2019-02-25 Fix warning // v0.91 2010-02-27 Fix euclidean division by INT_MIN for non-truncating C @@ -166,15 +167,15 @@ int stb_div_floor(int v1, int v2) return v1/v2; #else if (v1 >= 0 && v2 < 0) { - if ((-v1)+v2+1 < 0) // check if increasing v1's magnitude overflows - return -stb__div(-v1+v2+1,v2); // nope, so just compute it + if (v2 + 1 >= INT_MIN + v1) // check if increasing v1's magnitude overflows + return -stb__div((v2+1)-v1,v2); // nope, so just compute it else return -stb__div(-v1,v2) + ((-v1)%v2 ? -1 : 0); } if (v1 < 0 && v2 >= 0) { if (v1 != INT_MIN) { - if (v1-v2+1 < 0) // check if increasing v1's magnitude overflows - return -stb__div(v1-v2+1,-v2); // nope, so just compute it + if (v1 + 1 >= INT_MIN + v2) // check if increasing v1's magnitude overflows + return -stb__div((v1+1)-v2,-v2); // nope, so just compute it else return -stb__div(-v1,v2) + (stb__mod(v1,-v2) ? -1 : 0); } else // it must be possible to compute -(v1+v2) without overflowing @@ -209,8 +210,10 @@ int stb_div_eucl(int v1, int v2) else // if v1 is INT_MIN, we have to move away from overflow place if (v2 >= 0) q = -stb__div(-(v1+v2),v2)-1, r = -stb__mod(-(v1+v2),v2); - else + else if (v2 != INT_MIN) q = stb__div(-(v1-v2),-v2)+1, r = -stb__mod(-(v1-v2),-v2); + else // for INT_MIN / INT_MIN, we need to be extra-careful to avoid overflow + q = 1, r = 0; #endif if (r >= 0) return q; @@ -228,13 +231,13 @@ int stb_mod_trunc(int v1, int v2) if (r >= 0) return r; else - return r + (v2 > 0 ? v2 : -v2); + return r - (v2 < 0 ? v2 : -v2); } else { // modulus result should always be negative int r = stb__mod(v1,v2); if (r <= 0) return r; else - return r - (v2 > 0 ? v2 : -v2); + return r + (v2 < 0 ? v2 : -v2); } #endif } @@ -267,7 +270,7 @@ int stb_mod_eucl(int v1, int v2) if (r >= 0) return r; else - return r + (v2 > 0 ? v2 : -v2); // abs() + return r - (v2 < 0 ? v2 : -v2); // negative abs() [to avoid overflow] } #ifdef STB_DIVIDE_TEST diff --git a/third_party/stb/stb_ds.h b/third_party/stb/stb_ds.h index d1e0c75..e84c82d 100644 --- a/third_party/stb/stb_ds.h +++ b/third_party/stb/stb_ds.h @@ -1,4 +1,4 @@ -/* stb_ds.h - v0.65 - public domain data structures - Sean Barrett 2019 +/* stb_ds.h - v0.67 - public domain data structures - Sean Barrett 2019 This is a single-header-file library that provides easy-to-use dynamic arrays and hash tables for C (also works in C++). @@ -104,7 +104,7 @@ DOCUMENTATION moving the rest of the array over. Returns b. arrinsn: - void arrins(T* a, int p, int n); + void arrinsn(T* a, int p, int n); Inserts n uninitialized items into array a starting at a[p], moving the rest of the array over. @@ -123,7 +123,7 @@ DOCUMENTATION Deletes the element at a[p], moving the rest of the array over. arrdeln: - void arrdel(T* a, int p, int n); + void arrdeln(T* a, int p, int n); Deletes n elements starting at a[p], moving the rest of the array over. arrdelswap: @@ -379,6 +379,9 @@ CREDITS Andreas Molzer github:hashitaku github:srdjanstipic + Macoy Madson + Andreas Vennstrom + Tobias Mansfield-Williams */ #ifdef STBDS_UNIT_TESTS @@ -489,6 +492,7 @@ extern void stbds_unit_tests(void); // extern void * stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap); +extern void stbds_arrfreef(void *a); extern void stbds_hmfree_func(void *p, size_t elemsize); extern void * stbds_hmget_key(void *a, size_t elemsize, void *key, size_t keysize, int mode); extern void * stbds_hmget_key_ts(void *a, size_t elemsize, void *key, size_t keysize, ptrdiff_t *temp, int mode); @@ -531,24 +535,25 @@ extern void * stbds_shmode_func(size_t elemsize, int mode); #define stbds_temp(t) stbds_header(t)->temp #define stbds_temp_key(t) (*(char **) stbds_header(t)->hash_table) -#define stbds_arrsetcap(a,n) (stbds_arrgrow(a,0,n)) -#define stbds_arrsetlen(a,n) ((stbds_arrcap(a) < (size_t) (n) ? stbds_arrsetcap((a),(size_t)(n)),0 : 0), (a) ? stbds_header(a)->length = (size_t) (n) : 0) -#define stbds_arrcap(a) ((a) ? stbds_header(a)->capacity : 0) -#define stbds_arrlen(a) ((a) ? (ptrdiff_t) stbds_header(a)->length : 0) -#define stbds_arrlenu(a) ((a) ? stbds_header(a)->length : 0) -#define stbds_arrput(a,v) (stbds_arrmaybegrow(a,1), (a)[stbds_header(a)->length++] = (v)) -#define stbds_arrpush stbds_arrput // synonym -#define stbds_arrpop(a) (stbds_header(a)->length--, (a)[stbds_header(a)->length]) -#define stbds_arraddn(a,n) ((void)(stbds_arraddnoff(a, n))) // deprecated, use one of the following instead: -#define stbds_arraddnptr(a,n) (stbds_arrmaybegrow(a,n), stbds_header(a)->length += (n), &(a)[stbds_header(a)->length-(n)]) -#define stbds_arraddnoff(a,n) (stbds_arrmaybegrow(a,n), stbds_header(a)->length += (n), stbds_header(a)->length-(n)) -#define stbds_arrlast(a) ((a)[stbds_header(a)->length-1]) -#define stbds_arrfree(a) ((void) ((a) ? STBDS_FREE(NULL,stbds_header(a)) : (void)0), (a)=NULL) -#define stbds_arrdel(a,i) stbds_arrdeln(a,i,1) -#define stbds_arrdeln(a,i,n) (memmove(&(a)[i], &(a)[(i)+(n)], sizeof *(a) * (stbds_header(a)->length-(n)-(i))), stbds_header(a)->length -= (n)) -#define stbds_arrdelswap(a,i) ((a)[i] = stbds_arrlast(a), stbds_header(a)->length -= 1) -#define stbds_arrinsn(a,i,n) (stbds_arraddn((a),(n)), memmove(&(a)[(i)+(n)], &(a)[i], sizeof *(a) * (stbds_header(a)->length-(n)-(i)))) -#define stbds_arrins(a,i,v) (stbds_arrinsn((a),(i),1), (a)[i]=(v)) +#define stbds_arrsetcap(a,n) (stbds_arrgrow(a,0,n)) +#define stbds_arrsetlen(a,n) ((stbds_arrcap(a) < (size_t) (n) ? stbds_arrsetcap((a),(size_t)(n)),0 : 0), (a) ? stbds_header(a)->length = (size_t) (n) : 0) +#define stbds_arrcap(a) ((a) ? stbds_header(a)->capacity : 0) +#define stbds_arrlen(a) ((a) ? (ptrdiff_t) stbds_header(a)->length : 0) +#define stbds_arrlenu(a) ((a) ? stbds_header(a)->length : 0) +#define stbds_arrput(a,v) (stbds_arrmaybegrow(a,1), (a)[stbds_header(a)->length++] = (v)) +#define stbds_arrpush stbds_arrput // synonym +#define stbds_arrpop(a) (stbds_header(a)->length--, (a)[stbds_header(a)->length]) +#define stbds_arraddn(a,n) ((void)(stbds_arraddnindex(a, n))) // deprecated, use one of the following instead: +#define stbds_arraddnptr(a,n) (stbds_arrmaybegrow(a,n), (n) ? (stbds_header(a)->length += (n), &(a)[stbds_header(a)->length-(n)]) : (a)) +#define stbds_arraddnindex(a,n)(stbds_arrmaybegrow(a,n), (n) ? (stbds_header(a)->length += (n), stbds_header(a)->length-(n)) : stbds_arrlen(a)) +#define stbds_arraddnoff stbds_arraddnindex +#define stbds_arrlast(a) ((a)[stbds_header(a)->length-1]) +#define stbds_arrfree(a) ((void) ((a) ? STBDS_FREE(NULL,stbds_header(a)) : (void)0), (a)=NULL) +#define stbds_arrdel(a,i) stbds_arrdeln(a,i,1) +#define stbds_arrdeln(a,i,n) (memmove(&(a)[i], &(a)[(i)+(n)], sizeof *(a) * (stbds_header(a)->length-(n)-(i))), stbds_header(a)->length -= (n)) +#define stbds_arrdelswap(a,i) ((a)[i] = stbds_arrlast(a), stbds_header(a)->length -= 1) +#define stbds_arrinsn(a,i,n) (stbds_arraddn((a),(n)), memmove(&(a)[(i)+(n)], &(a)[i], sizeof *(a) * (stbds_header(a)->length-(n)-(i)))) +#define stbds_arrins(a,i,v) (stbds_arrinsn((a),(i),1), (a)[i]=(v)) #define stbds_arrmaybegrow(a,n) ((!(a) || stbds_header(a)->length + (n) > stbds_header(a)->capacity) \ ? (stbds_arrgrow(a,n,0),0) : 0) @@ -595,7 +600,7 @@ extern void * stbds_shmode_func(size_t elemsize, int mode); #define stbds_hmget_ts(t, k, temp) (stbds_hmgetp_ts(t,k,temp)->value) #define stbds_hmlen(t) ((t) ? (ptrdiff_t) stbds_header((t)-1)->length-1 : 0) #define stbds_hmlenu(t) ((t) ? stbds_header((t)-1)->length-1 : 0) -#define stbds_hmgetp_null(t,k) (stbds_hmgeti(t,k) == -1 ? NULL : &(t)[stbds_temp(t)-1]) +#define stbds_hmgetp_null(t,k) (stbds_hmgeti(t,k) == -1 ? NULL : &(t)[stbds_temp((t)-1)]) #define stbds_shput(t, k, v) \ ((t) = stbds_hmput_key_wrapper((t), sizeof *(t), (void*) (k), sizeof (t)->key, STBDS_HM_STRING), \ @@ -646,7 +651,7 @@ extern void * stbds_shmode_func(size_t elemsize, int mode); #define stbds_shgets(t, k) (*stbds_shgetp(t,k)) #define stbds_shget(t, k) (stbds_shgetp(t,k)->value) -#define stbds_shgetp_null(t,k) (stbds_shgeti(t,k) == -1 ? NULL : &(t)[stbds_temp(t)-1]) +#define stbds_shgetp_null(t,k) (stbds_shgeti(t,k) == -1 ? NULL : &(t)[stbds_temp((t)-1)]) #define stbds_shlen stbds_hmlen typedef struct @@ -756,8 +761,10 @@ size_t stbds_rehash_items; void *stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap) { + stbds_array_header temp={0}; // force debugging void *b; size_t min_len = stbds_arrlen(a) + addlen; + (void) sizeof(temp); // compute the minimum capacity needed if (min_len > min_cap) @@ -781,6 +788,7 @@ void *stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap) if (a == NULL) { stbds_header(b)->length = 0; stbds_header(b)->hash_table = 0; + stbds_header(b)->temp = 0; } else { STBDS_STATS(++stbds_array_grow); } @@ -789,6 +797,11 @@ void *stbds_arrgrowf(void *a, size_t elemsize, size_t addlen, size_t min_cap) return b; } +void stbds_arrfreef(void *a) +{ + STBDS_FREE(NULL, stbds_header(a)); +} + // // stbds_hm hash table implementation // @@ -1359,7 +1372,6 @@ void *stbds_hmput_key(void *a, size_t elemsize, void *key, size_t keysize, int m { size_t hash = mode >= STBDS_HM_STRING ? stbds_hash_string((char*)key,table->seed) : stbds_hash_bytes(key, keysize,table->seed); size_t step = STBDS_BUCKET_LENGTH; - size_t limit,i; size_t pos; ptrdiff_t tombstone = -1; stbds_hash_bucket *bucket; @@ -1370,6 +1382,7 @@ void *stbds_hmput_key(void *a, size_t elemsize, void *key, size_t keysize, int m pos = stbds_probe_position(hash, table->slot_count, table->slot_count_log2); for (;;) { + size_t limit, i; STBDS_STATS(++stbds_hash_probes); bucket = &table->storage[pos >> STBDS_BUCKET_SHIFT]; @@ -1378,6 +1391,8 @@ void *stbds_hmput_key(void *a, size_t elemsize, void *key, size_t keysize, int m if (bucket->hash[i] == hash) { if (stbds_is_key_equal(raw_a, elemsize, key, keysize, keyoffset, mode, bucket->index[i])) { stbds_temp(a) = bucket->index[i]; + if (mode >= STBDS_HM_STRING) + stbds_temp_key(a) = * (char **) ((char *) raw_a + elemsize*bucket->index[i] + keyoffset); return STBDS_ARR_TO_HASH(a,elemsize); } } else if (bucket->hash[i] == 0) { diff --git a/third_party/stb/stb_dxt.h b/third_party/stb/stb_dxt.h index 04666de..6150a87 100644 --- a/third_party/stb/stb_dxt.h +++ b/third_party/stb/stb_dxt.h @@ -1,4 +1,4 @@ -// stb_dxt.h - v1.10 - DXT1/DXT5 compressor - public domain +// stb_dxt.h - v1.12 - DXT1/DXT5 compressor - public domain // original by fabian "ryg" giesen - ported to C by stb // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation // @@ -10,6 +10,8 @@ // You can turn on dithering and "high quality" using mode. // // version history: +// v1.12 - (ryg) fix bug in single-color table generator +// v1.11 - (ryg) avoid racy global init, better single-color tables, remove dither // v1.10 - (i.c) various small quality improvements // v1.09 - (stb) update documentation re: surprising alpha channel requirement // v1.08 - (stb) fix bug in dxt-with-alpha block @@ -29,6 +31,7 @@ // Kevin Schmidt (#defines for "freestanding" compilation) // github:ppiastucki (BC4 support) // Ignacio Castano - improve DXT endpoint quantization +// Alan Hickman - static table initialization // // LICENSE // @@ -49,7 +52,7 @@ extern "C" { // compression mode (bitflags) #define STB_DXT_NORMAL 0 -#define STB_DXT_DITHER 1 // use dithering. dubious win. never use for normal maps and the like! +#define STB_DXT_DITHER 1 // use dithering. was always dubious, now deprecated. does nothing! #define STB_DXT_HIGHQUAL 2 // high quality mode, does two refinement steps instead of 1. ~30-40% slower. STBDDEF void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src_rgba_four_bytes_per_pixel, int alpha, int mode); @@ -81,29 +84,82 @@ STBDDEF void stb_compress_bc5_block(unsigned char *dest, const unsigned char *sr #include -#if !defined(STBD_ABS) || !defined(STBI_FABS) +#if !defined(STBD_FABS) #include #endif -#ifndef STBD_ABS -#define STBD_ABS(i) abs(i) -#endif - #ifndef STBD_FABS #define STBD_FABS(x) fabs(x) #endif -#ifndef STBD_MEMSET -#include -#define STBD_MEMSET memset -#endif - -static unsigned char stb__Expand5[32]; -static unsigned char stb__Expand6[64]; -static unsigned char stb__OMatch5[256][2]; -static unsigned char stb__OMatch6[256][2]; -static unsigned char stb__QuantRBTab[256+16]; -static unsigned char stb__QuantGTab[256+16]; +static const unsigned char stb__OMatch5[256][2] = { + { 0, 0 }, { 0, 0 }, { 0, 1 }, { 0, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 1 }, + { 1, 1 }, { 1, 1 }, { 1, 2 }, { 0, 4 }, { 2, 1 }, { 2, 1 }, { 2, 1 }, { 2, 2 }, + { 2, 2 }, { 2, 2 }, { 2, 3 }, { 1, 5 }, { 3, 2 }, { 3, 2 }, { 4, 0 }, { 3, 3 }, + { 3, 3 }, { 3, 3 }, { 3, 4 }, { 3, 4 }, { 3, 4 }, { 3, 5 }, { 4, 3 }, { 4, 3 }, + { 5, 2 }, { 4, 4 }, { 4, 4 }, { 4, 5 }, { 4, 5 }, { 5, 4 }, { 5, 4 }, { 5, 4 }, + { 6, 3 }, { 5, 5 }, { 5, 5 }, { 5, 6 }, { 4, 8 }, { 6, 5 }, { 6, 5 }, { 6, 5 }, + { 6, 6 }, { 6, 6 }, { 6, 6 }, { 6, 7 }, { 5, 9 }, { 7, 6 }, { 7, 6 }, { 8, 4 }, + { 7, 7 }, { 7, 7 }, { 7, 7 }, { 7, 8 }, { 7, 8 }, { 7, 8 }, { 7, 9 }, { 8, 7 }, + { 8, 7 }, { 9, 6 }, { 8, 8 }, { 8, 8 }, { 8, 9 }, { 8, 9 }, { 9, 8 }, { 9, 8 }, + { 9, 8 }, { 10, 7 }, { 9, 9 }, { 9, 9 }, { 9, 10 }, { 8, 12 }, { 10, 9 }, { 10, 9 }, + { 10, 9 }, { 10, 10 }, { 10, 10 }, { 10, 10 }, { 10, 11 }, { 9, 13 }, { 11, 10 }, { 11, 10 }, + { 12, 8 }, { 11, 11 }, { 11, 11 }, { 11, 11 }, { 11, 12 }, { 11, 12 }, { 11, 12 }, { 11, 13 }, + { 12, 11 }, { 12, 11 }, { 13, 10 }, { 12, 12 }, { 12, 12 }, { 12, 13 }, { 12, 13 }, { 13, 12 }, + { 13, 12 }, { 13, 12 }, { 14, 11 }, { 13, 13 }, { 13, 13 }, { 13, 14 }, { 12, 16 }, { 14, 13 }, + { 14, 13 }, { 14, 13 }, { 14, 14 }, { 14, 14 }, { 14, 14 }, { 14, 15 }, { 13, 17 }, { 15, 14 }, + { 15, 14 }, { 16, 12 }, { 15, 15 }, { 15, 15 }, { 15, 15 }, { 15, 16 }, { 15, 16 }, { 15, 16 }, + { 15, 17 }, { 16, 15 }, { 16, 15 }, { 17, 14 }, { 16, 16 }, { 16, 16 }, { 16, 17 }, { 16, 17 }, + { 17, 16 }, { 17, 16 }, { 17, 16 }, { 18, 15 }, { 17, 17 }, { 17, 17 }, { 17, 18 }, { 16, 20 }, + { 18, 17 }, { 18, 17 }, { 18, 17 }, { 18, 18 }, { 18, 18 }, { 18, 18 }, { 18, 19 }, { 17, 21 }, + { 19, 18 }, { 19, 18 }, { 20, 16 }, { 19, 19 }, { 19, 19 }, { 19, 19 }, { 19, 20 }, { 19, 20 }, + { 19, 20 }, { 19, 21 }, { 20, 19 }, { 20, 19 }, { 21, 18 }, { 20, 20 }, { 20, 20 }, { 20, 21 }, + { 20, 21 }, { 21, 20 }, { 21, 20 }, { 21, 20 }, { 22, 19 }, { 21, 21 }, { 21, 21 }, { 21, 22 }, + { 20, 24 }, { 22, 21 }, { 22, 21 }, { 22, 21 }, { 22, 22 }, { 22, 22 }, { 22, 22 }, { 22, 23 }, + { 21, 25 }, { 23, 22 }, { 23, 22 }, { 24, 20 }, { 23, 23 }, { 23, 23 }, { 23, 23 }, { 23, 24 }, + { 23, 24 }, { 23, 24 }, { 23, 25 }, { 24, 23 }, { 24, 23 }, { 25, 22 }, { 24, 24 }, { 24, 24 }, + { 24, 25 }, { 24, 25 }, { 25, 24 }, { 25, 24 }, { 25, 24 }, { 26, 23 }, { 25, 25 }, { 25, 25 }, + { 25, 26 }, { 24, 28 }, { 26, 25 }, { 26, 25 }, { 26, 25 }, { 26, 26 }, { 26, 26 }, { 26, 26 }, + { 26, 27 }, { 25, 29 }, { 27, 26 }, { 27, 26 }, { 28, 24 }, { 27, 27 }, { 27, 27 }, { 27, 27 }, + { 27, 28 }, { 27, 28 }, { 27, 28 }, { 27, 29 }, { 28, 27 }, { 28, 27 }, { 29, 26 }, { 28, 28 }, + { 28, 28 }, { 28, 29 }, { 28, 29 }, { 29, 28 }, { 29, 28 }, { 29, 28 }, { 30, 27 }, { 29, 29 }, + { 29, 29 }, { 29, 30 }, { 29, 30 }, { 30, 29 }, { 30, 29 }, { 30, 29 }, { 30, 30 }, { 30, 30 }, + { 30, 30 }, { 30, 31 }, { 30, 31 }, { 31, 30 }, { 31, 30 }, { 31, 30 }, { 31, 31 }, { 31, 31 }, +}; +static const unsigned char stb__OMatch6[256][2] = { + { 0, 0 }, { 0, 1 }, { 1, 0 }, { 1, 1 }, { 1, 1 }, { 1, 2 }, { 2, 1 }, { 2, 2 }, + { 2, 2 }, { 2, 3 }, { 3, 2 }, { 3, 3 }, { 3, 3 }, { 3, 4 }, { 4, 3 }, { 4, 4 }, + { 4, 4 }, { 4, 5 }, { 5, 4 }, { 5, 5 }, { 5, 5 }, { 5, 6 }, { 6, 5 }, { 6, 6 }, + { 6, 6 }, { 6, 7 }, { 7, 6 }, { 7, 7 }, { 7, 7 }, { 7, 8 }, { 8, 7 }, { 8, 8 }, + { 8, 8 }, { 8, 9 }, { 9, 8 }, { 9, 9 }, { 9, 9 }, { 9, 10 }, { 10, 9 }, { 10, 10 }, + { 10, 10 }, { 10, 11 }, { 11, 10 }, { 8, 16 }, { 11, 11 }, { 11, 12 }, { 12, 11 }, { 9, 17 }, + { 12, 12 }, { 12, 13 }, { 13, 12 }, { 11, 16 }, { 13, 13 }, { 13, 14 }, { 14, 13 }, { 12, 17 }, + { 14, 14 }, { 14, 15 }, { 15, 14 }, { 14, 16 }, { 15, 15 }, { 15, 16 }, { 16, 14 }, { 16, 15 }, + { 17, 14 }, { 16, 16 }, { 16, 17 }, { 17, 16 }, { 18, 15 }, { 17, 17 }, { 17, 18 }, { 18, 17 }, + { 20, 14 }, { 18, 18 }, { 18, 19 }, { 19, 18 }, { 21, 15 }, { 19, 19 }, { 19, 20 }, { 20, 19 }, + { 20, 20 }, { 20, 20 }, { 20, 21 }, { 21, 20 }, { 21, 21 }, { 21, 21 }, { 21, 22 }, { 22, 21 }, + { 22, 22 }, { 22, 22 }, { 22, 23 }, { 23, 22 }, { 23, 23 }, { 23, 23 }, { 23, 24 }, { 24, 23 }, + { 24, 24 }, { 24, 24 }, { 24, 25 }, { 25, 24 }, { 25, 25 }, { 25, 25 }, { 25, 26 }, { 26, 25 }, + { 26, 26 }, { 26, 26 }, { 26, 27 }, { 27, 26 }, { 24, 32 }, { 27, 27 }, { 27, 28 }, { 28, 27 }, + { 25, 33 }, { 28, 28 }, { 28, 29 }, { 29, 28 }, { 27, 32 }, { 29, 29 }, { 29, 30 }, { 30, 29 }, + { 28, 33 }, { 30, 30 }, { 30, 31 }, { 31, 30 }, { 30, 32 }, { 31, 31 }, { 31, 32 }, { 32, 30 }, + { 32, 31 }, { 33, 30 }, { 32, 32 }, { 32, 33 }, { 33, 32 }, { 34, 31 }, { 33, 33 }, { 33, 34 }, + { 34, 33 }, { 36, 30 }, { 34, 34 }, { 34, 35 }, { 35, 34 }, { 37, 31 }, { 35, 35 }, { 35, 36 }, + { 36, 35 }, { 36, 36 }, { 36, 36 }, { 36, 37 }, { 37, 36 }, { 37, 37 }, { 37, 37 }, { 37, 38 }, + { 38, 37 }, { 38, 38 }, { 38, 38 }, { 38, 39 }, { 39, 38 }, { 39, 39 }, { 39, 39 }, { 39, 40 }, + { 40, 39 }, { 40, 40 }, { 40, 40 }, { 40, 41 }, { 41, 40 }, { 41, 41 }, { 41, 41 }, { 41, 42 }, + { 42, 41 }, { 42, 42 }, { 42, 42 }, { 42, 43 }, { 43, 42 }, { 40, 48 }, { 43, 43 }, { 43, 44 }, + { 44, 43 }, { 41, 49 }, { 44, 44 }, { 44, 45 }, { 45, 44 }, { 43, 48 }, { 45, 45 }, { 45, 46 }, + { 46, 45 }, { 44, 49 }, { 46, 46 }, { 46, 47 }, { 47, 46 }, { 46, 48 }, { 47, 47 }, { 47, 48 }, + { 48, 46 }, { 48, 47 }, { 49, 46 }, { 48, 48 }, { 48, 49 }, { 49, 48 }, { 50, 47 }, { 49, 49 }, + { 49, 50 }, { 50, 49 }, { 52, 46 }, { 50, 50 }, { 50, 51 }, { 51, 50 }, { 53, 47 }, { 51, 51 }, + { 51, 52 }, { 52, 51 }, { 52, 52 }, { 52, 52 }, { 52, 53 }, { 53, 52 }, { 53, 53 }, { 53, 53 }, + { 53, 54 }, { 54, 53 }, { 54, 54 }, { 54, 54 }, { 54, 55 }, { 55, 54 }, { 55, 55 }, { 55, 55 }, + { 55, 56 }, { 56, 55 }, { 56, 56 }, { 56, 56 }, { 56, 57 }, { 57, 56 }, { 57, 57 }, { 57, 57 }, + { 57, 58 }, { 58, 57 }, { 58, 58 }, { 58, 58 }, { 58, 59 }, { 59, 58 }, { 59, 59 }, { 59, 59 }, + { 59, 60 }, { 60, 59 }, { 60, 60 }, { 60, 60 }, { 60, 61 }, { 61, 60 }, { 61, 61 }, { 61, 61 }, + { 61, 62 }, { 62, 61 }, { 62, 62 }, { 62, 62 }, { 62, 63 }, { 63, 62 }, { 63, 63 }, { 63, 63 }, +}; static int stb__Mul8Bit(int a, int b) { @@ -117,9 +173,10 @@ static void stb__From16Bit(unsigned char *out, unsigned short v) int gv = (v & 0x07e0) >> 5; int bv = (v & 0x001f) >> 0; - out[0] = stb__Expand5[rv]; - out[1] = stb__Expand6[gv]; - out[2] = stb__Expand5[bv]; + // expand to 8 bits via bit replication + out[0] = (rv * 33) >> 2; + out[1] = (gv * 65) >> 4; + out[2] = (bv * 33) >> 2; out[3] = 0; } @@ -151,35 +208,6 @@ static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char /****************************************************************************/ -// compute table to reproduce constant colors as accurately as possible -static void stb__PrepareOptTable(unsigned char *Table,const unsigned char *expand,int size) -{ - int i,mn,mx; - for (i=0;i<256;i++) { - int bestErr = 256; - for (mn=0;mn> 4)]; - ep1[0] = bp[ 0] - dp[ 0]; - dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)]; - ep1[1] = bp[ 4] - dp[ 4]; - dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)]; - ep1[2] = bp[ 8] - dp[ 8]; - dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)]; - ep1[3] = bp[12] - dp[12]; - bp += 16; - dp += 16; - et = ep1, ep1 = ep2, ep2 = et; // swap - } - } -} - // The color matching function -static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color,int dither) +static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color) { unsigned int mask = 0; int dirr = color[0*4+0] - color[1*4+0]; @@ -246,68 +246,14 @@ static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *c halfPoint = (stops[3] + stops[2]); c3Point = (stops[2] + stops[0]); - if(!dither) { - // the version without dithering is straightforward - for (i=15;i>=0;i--) { - int dot = dots[i]*2; - mask <<= 2; + for (i=15;i>=0;i--) { + int dot = dots[i]*2; + mask <<= 2; - if(dot < halfPoint) - mask |= (dot < c0Point) ? 1 : 3; - else - mask |= (dot < c3Point) ? 2 : 0; - } - } else { - // with floyd-steinberg dithering - int err[8],*ep1 = err,*ep2 = err+4; - int *dp = dots, y; - - c0Point <<= 3; - halfPoint <<= 3; - c3Point <<= 3; - for(i=0;i<8;i++) - err[i] = 0; - - for(y=0;y<4;y++) - { - int dot,lmask,step; - - dot = (dp[0] << 4) + (3*ep2[1] + 5*ep2[0]); - if(dot < halfPoint) - step = (dot < c0Point) ? 1 : 3; - else - step = (dot < c3Point) ? 2 : 0; - ep1[0] = dp[0] - stops[step]; - lmask = step; - - dot = (dp[1] << 4) + (7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]); - if(dot < halfPoint) - step = (dot < c0Point) ? 1 : 3; - else - step = (dot < c3Point) ? 2 : 0; - ep1[1] = dp[1] - stops[step]; - lmask |= step<<2; - - dot = (dp[2] << 4) + (7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]); - if(dot < halfPoint) - step = (dot < c0Point) ? 1 : 3; - else - step = (dot < c3Point) ? 2 : 0; - ep1[2] = dp[2] - stops[step]; - lmask |= step<<4; - - dot = (dp[3] << 4) + (7*ep1[2] + 5*ep2[3] + ep2[2]); - if(dot < halfPoint) - step = (dot < c0Point) ? 1 : 3; - else - step = (dot < c3Point) ? 2 : 0; - ep1[3] = dp[3] - stops[step]; - lmask |= step<<6; - - dp += 4; - mask |= lmask << (y*8); - { int *et = ep1; ep1 = ep2; ep2 = et; } // swap - } + if(dot < halfPoint) + mask |= (dot < c0Point) ? 1 : 3; + else + mask |= (dot < c3Point) ? 2 : 0; } return mask; @@ -316,7 +262,7 @@ static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *c // The color optimization function. (Clever code, part 1) static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16) { - int mind = 0x7fffffff,maxd = -0x7fffffff; + int mind,maxd; unsigned char *minp, *maxp; double magn; int v_r,v_g,v_b; @@ -398,8 +344,10 @@ static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax1 v_b = (int) (vfb * magn); } + minp = maxp = block; + mind = maxd = block[0]*v_r + block[1]*v_g + block[2]*v_b; // Pick colors at extreme points - for(i=0;i<16;i++) + for(i=1;i<16;i++) { int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b; @@ -418,12 +366,12 @@ static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax1 *pmin16 = stb__As16Bit(minp[0],minp[1],minp[2]); } -static const float midpoints5[32] = { +static const float stb__midpoints5[32] = { 0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f, 0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f }; -static const float midpoints6[64] = { +static const float stb__midpoints6[64] = { 0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, 0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, 0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, @@ -435,7 +383,7 @@ static unsigned short stb__Quantize5(float x) unsigned short q; x = x < 0 ? 0 : x > 1 ? 1 : x; // saturate q = (unsigned short)(x * 31); - q += (x > midpoints5[q]); + q += (x > stb__midpoints5[q]); return q; } @@ -444,7 +392,7 @@ static unsigned short stb__Quantize6(float x) unsigned short q; x = x < 0 ? 0 : x > 1 ? 1 : x; // saturate q = (unsigned short)(x * 63); - q += (x > midpoints6[q]); + q += (x > stb__midpoints6[q]); return q; } @@ -532,12 +480,10 @@ static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, i { unsigned int mask; int i; - int dither; int refinecount; unsigned short max16, min16; - unsigned char dblock[16*4],color[4*4]; + unsigned char color[4*4]; - dither = mode & STB_DXT_DITHER; refinecount = (mode & STB_DXT_HIGHQUAL) ? 2 : 1; // check if block is constant @@ -551,15 +497,11 @@ static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, i max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0]; min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1]; } else { - // first step: compute dithered version for PCA if desired - if(dither) - stb__DitherBlock(dblock,block); - - // second step: pca+map along principal axis - stb__OptimizeColorsBlock(dither ? dblock : block,&max16,&min16); + // first step: PCA+map along principal axis + stb__OptimizeColorsBlock(block,&max16,&min16); if (max16 != min16) { stb__EvalColors(color,max16,min16); - mask = stb__MatchColorsBlock(block,color,dither); + mask = stb__MatchColorsBlock(block,color); } else mask = 0; @@ -567,10 +509,10 @@ static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, i for (i=0;i>2)); - - for(i=0;i<64;i++) - stb__Expand6[i] = (unsigned char)((i<<2)|(i>>4)); - - for(i=0;i<256+16;i++) - { - int v = i-8 < 0 ? 0 : i-8 > 255 ? 255 : i-8; - stb__QuantRBTab[i] = stb__Expand5[stb__Mul8Bit(v,31)]; - stb__QuantGTab[i] = stb__Expand6[stb__Mul8Bit(v,63)]; - } - - stb__PrepareOptTable(&stb__OMatch5[0][0],stb__Expand5,32); - stb__PrepareOptTable(&stb__OMatch6[0][0],stb__Expand6,64); -} - void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode) { unsigned char data[16][4]; - static int init=1; - if (init) { - stb__InitDXT(); - init=0; - } - if (alpha) { int i; stb__CompressAlphaBlock(dest,(unsigned char*) src+3, 4); @@ -710,6 +626,56 @@ void stb_compress_bc5_block(unsigned char *dest, const unsigned char *src) } #endif // STB_DXT_IMPLEMENTATION +// Compile with STB_DXT_IMPLEMENTATION and STB_DXT_GENERATE_TABLES +// defined to generate the tables above. +#ifdef STB_DXT_GENERATE_TABLES +#include + +int main() +{ + int i, j; + const char *omatch_names[] = { "stb__OMatch5", "stb__OMatch6" }; + int dequant_mults[2] = { 33*4, 65 }; // .4 fixed-point dequant multipliers + + // optimal endpoint tables + for (i = 0; i < 2; ++i) { + int dequant = dequant_mults[i]; + int size = i ? 64 : 32; + printf("static const unsigned char %s[256][2] = {\n", omatch_names[i]); + for (int j = 0; j < 256; ++j) { + int mn, mx; + int best_mn = 0, best_mx = 0; + int best_err = 256 * 100; + for (mn=0;mn> 4; + int maxe = (mx * dequant) >> 4; + int err = abs(stb__Lerp13(maxe, mine) - j) * 100; + + // DX10 spec says that interpolation must be within 3% of "correct" result, + // add this as error term. Normally we'd expect a random distribution of + // +-1.5% error, but nowhere in the spec does it say that the error has to be + // unbiased - better safe than sorry. + err += abs(maxe - mine) * 3; + + if(err < best_err) { + best_mn = mn; + best_mx = mx; + best_err = err; + } + } + } + if ((j % 8) == 0) printf(" "); // 2 spaces, third is done below + printf(" { %2d, %2d },", best_mx, best_mn); + if ((j % 8) == 7) printf("\n"); + } + printf("};\n"); + } + + return 0; +} +#endif + /* ------------------------------------------------------------------------------ This software is available under 2 licenses -- choose whichever you prefer. diff --git a/third_party/stb/stb_hexwave.h b/third_party/stb/stb_hexwave.h new file mode 100644 index 0000000..480ab1b --- /dev/null +++ b/third_party/stb/stb_hexwave.h @@ -0,0 +1,680 @@ +// stb_hexwave - v0.5 - public domain, initial release 2021-04-01 +// +// A flexible anti-aliased (bandlimited) digital audio oscillator. +// +// This library generates waveforms of a variety of shapes made of +// line segments. It does not do envelopes, LFO effects, etc.; it +// merely tries to solve the problem of generating an artifact-free +// morphable digital waveform with a variety of spectra, and leaves +// it to the user to rescale the waveform and mix multiple voices, etc. +// +// Compiling: +// +// In one C/C++ file that #includes this file, do +// +// #define STB_HEXWAVE_IMPLEMENTATION +// #include "stb_hexwave.h" +// +// Optionally, #define STB_HEXWAVE_STATIC before including +// the header to cause the definitions to be private to the +// implementation file (i.e. to be "static" instead of "extern"). +// +// Notes: +// +// Optionally performs memory allocation during initialization, +// never allocates otherwise. +// +// License: +// +// See end of file for license information. +// +// Usage: +// +// Initialization: +// +// hexwave_init(32,16,NULL); // read "header section" for alternatives +// +// Create oscillator: +// +// HexWave *osc = malloc(sizeof(*osc)); // or "new HexWave", or declare globally or on stack +// hexwave_create(osc, reflect_flag, peak_time, half_height, zero_wait); +// see "Waveform shapes" below for the meaning of these parameters +// +// Generate audio: +// +// hexwave_generate_samples(output, number_of_samples, osc, oscillator_freq) +// where: +// output is a buffer where the library will store floating point audio samples +// number_of_samples is the number of audio samples to generate +// osc is a pointer to a Hexwave +// oscillator_freq is the frequency of the oscillator divided by the sample rate +// +// The output samples will continue from where the samples generated by the +// previous hexwave_generate_samples() on this oscillator ended. +// +// Change oscillator waveform: +// +// hexwave_change(osc, reflect_flag, peak_time, half_height, zero_wait); +// can call in between calls to hexwave_generate_samples +// +// Waveform shapes: +// +// All waveforms generated by hexwave are constructed from six line segments +// characterized by 3 parameters. +// +// See demonstration: https://www.youtube.com/watch?v=hsUCrAsDN-M +// +// reflect=0 reflect=1 +// +// 0-----P---1 0-----P---1 peak_time = P +// . 1 . 1 +// /\_ : /\_ : +// / \_ : / \_ : +// / \.H / \.H half_height = H +// / | : / | : +// _____/ |_:___ _____/ | : _____ +// . : \ | . | : / +// . : \ | . | : / +// . : \ _/ . \_: / +// . : \ _/ . :_ / +// . -1 \/ . -1 \/ +// 0 - Z - - - - 1 0 - Z - - - - 1 zero_wait = Z +// +// Classic waveforms: +// peak half zero +// reflect time height wait +// Sawtooth 1 0 0 0 +// Square 1 0 1 0 +// Triangle 1 0.5 0 0 +// +// Some waveforms can be produced in multiple ways, which is useful when morphing +// into other waveforms, and there are a few more notable shapes: +// +// peak half zero +// reflect time height wait +// Sawtooth 1 1 any 0 +// Sawtooth (8va) 1 0 -1 0 +// Triangle 1 0.5 0 0 +// Square 1 0 1 0 +// Square 0 0 1 0 +// Triangle 0 0.5 0 0 +// Triangle 0 0 -1 0 +// AlternatingSaw 0 0 0 0 +// AlternatingSaw 0 1 any 0 +// Stairs 0 0 1 0.5 +// +// The "Sawtooth (8va)" waveform is identical to a sawtooth wave with 2x the +// frequency, but when morphed with other values, it becomes an overtone of +// the base frequency. +// +// Morphing waveforms: +// +// Sweeping peak_time morphs the waveform while producing various spectra. +// Sweeping half_height effectively crossfades between two waveforms; useful, but less exciting. +// Sweeping zero_wait produces a similar effect no matter the reset of the waveform, +// a sort of high-pass/PWM effect where the wave becomes silent at zero_wait=1. +// +// You can trivially morph between any two waveforms from the above table +// which only differ in one column. +// +// Crossfade between classic waveforms: +// peak half zero +// Start End reflect time height wait +// ----- --- ------- ---- ------ ---- +// Triangle Square 0 0 -1..1 0 +// Saw Square 1 0 0..1 0 +// Triangle Saw 1 0.5 0..2 0 +// +// The last morph uses uses half-height values larger than 1, which means it will +// be louder and the output should be scaled down by half to compensate, or better +// by dynamically tracking the morph: volume_scale = 1 - half_height/4 +// +// Non-crossfade morph between classic waveforms, most require changing +// two parameters at the same time: +// peak half zero +// Start End reflect time height wait +// ----- --- ------- ---- ------ ---- +// Square Triangle any 0..0.5 1..0 0 +// Square Saw 1 0..1 1..any 0 +// Triangle Saw 1 0.5..1 0..-1 0 +// +// Other noteworthy morphs between simple shapes: +// peak half zero +// Start Halfway End reflect time height wait +// ----- --------- --- ------- ---- ------ ---- +// Saw (8va,neg) Saw (pos) 1 0..1 -1 0 +// Saw (neg) Saw (pos) 1 0..1 0 0 +// Triangle AlternatingSaw 0 0..1 -1 0 +// AlternatingSaw Triangle AlternatingSaw 0 0..1 0 0 +// Square AlternatingSaw 0 0..1 1 0 +// Triangle Triangle AlternatingSaw 0 0..1 -1..1 0 +// Square AlternatingSaw 0 0..1 1..0 0 +// Saw (8va) Triangle Saw 1 0..1 -1..1 0 +// Saw (neg) Saw (pos) 1 0..1 0..1 0 +// AlternatingSaw AlternatingSaw 0 0..1 0..any 0 +// +// The last entry is noteworthy because the morph from the halfway point to either +// endpoint sounds very different. For example, an LFO sweeping back and forth over +// the whole range will morph between the middle timbre and the AlternatingSaw +// timbre in two different ways, alternating. +// +// Entries with "any" for half_height are whole families of morphs, as you can pick +// any value you want as the endpoint for half_height. +// +// You can always morph between any two waveforms with the same value of 'reflect' +// by just sweeping the parameters simultaneously. There will never be artifacts +// and the result will always be useful, if not necessarily what you want. +// +// You can vary the sound of two-parameter morphs by ramping them differently, +// e.g. if the morph goes from t=0..1, then square-to-triangle looks like: +// peak_time = lerp(t, 0, 0.5) +// half_height = lerp(t, 1, 0 ) +// but you can also do things like: +// peak_time = lerp(smoothstep(t), 0, 0.5) +// half_height = cos(PI/2 * t) +// +// How it works: +// +// hexwave use BLEP to bandlimit discontinuities and BLAMP +// to bandlimit C1 discontinuities. This is not polyBLEP +// (polynomial BLEP), it is table-driven BLEP. It is +// also not minBLEP (minimum-phase BLEP), as that complicates +// things for little benefit once BLAMP is involved. +// +// The previous oscillator frequency is remembered, and when +// the frequency changes, a BLAMP is generated to remove the +// C1 discontinuity, which reduces artifacts for sweeps/LFO. +// +// Changes to an oscillator timbre using hexwave_change() actually +// wait until the oscillator finishes its current cycle. All +// waveforms with non-zero "zero_wait" settings pass through 0 +// and have 0-slope at the start of a cycle, which means changing +// the settings is artifact free at that time. (If zero_wait is 0, +// the code still treats it as passing through 0 with 0-slope; it'll +// apply the necessary fixups to make it artifact free as if it does +// transition to 0 with 0-slope vs. the waveform at the end of +// the cycle, then adds the fixups for a non-0 and non-0 slope +// at the start of the cycle, which cancels out if zero_wait is 0, +// and still does the right thing if zero_wait is 0 when the +// settings are updated.) +// +// BLEP/BLAMP normally requires overlapping buffers, but this +// is hidden from the user by generating the waveform to a +// temporary buffer and saving the overlap regions internally +// between calls. (It is slightly more complicated; see code.) +// +// By design all shapes have 0 DC offset; this is one reason +// hexwave uses zero_wait instead of standard PWM. +// +// The internals of hexwave could support any arbitrary shape +// made of line segments, but I chose not to expose this +// generality in favor of a simple, easy-to-use API. + +#ifndef STB_INCLUDE_STB_HEXWAVE_H +#define STB_INCLUDE_STB_HEXWAVE_H + +#ifndef STB_HEXWAVE_MAX_BLEP_LENGTH +#define STB_HEXWAVE_MAX_BLEP_LENGTH 64 // good enough for anybody +#endif + +#ifdef STB_HEXWAVE_STATIC +#define STB_HEXWAVE_DEF static +#else +#define STB_HEXWAVE_DEF extern +#endif + +typedef struct HexWave HexWave; + +STB_HEXWAVE_DEF void hexwave_init(int width, int oversample, float *user_buffer); +// width: size of BLEP, from 4..64, larger is slower & more memory but less aliasing +// oversample: 2+, number of subsample positions, larger uses more memory but less noise +// user_buffer: optional, if provided the library will perform no allocations. +// 16*width*(oversample+1) bytes, must stay allocated as long as library is used +// technically it only needs: 8*( width * (oversample + 1)) +// + 8*((width * oversample) + 1) bytes +// +// width can be larger than 64 if you define STB_HEXWAVE_MAX_BLEP_LENGTH to a larger value + +STB_HEXWAVE_DEF void hexwave_shutdown(float *user_buffer); +// user_buffer: pass in same parameter as passed to hexwave_init + +STB_HEXWAVE_DEF void hexwave_create(HexWave *hex, int reflect, float peak_time, float half_height, float zero_wait); +// see docs above for description +// +// reflect is tested as 0 or non-zero +// peak_time is clamped to 0..1 +// half_height is not clamped +// zero_wait is clamped to 0..1 + +STB_HEXWAVE_DEF void hexwave_change(HexWave *hex, int reflect, float peak_time, float half_height, float zero_wait); +// see docs + +STB_HEXWAVE_DEF void hexwave_generate_samples(float *output, int num_samples, HexWave *hex, float freq); +// output: buffer where the library will store generated floating point audio samples +// number_of_samples: the number of audio samples to generate +// osc: pointer to a Hexwave initialized with 'hexwave_create' +// oscillator_freq: frequency of the oscillator divided by the sample rate + +// private: +typedef struct +{ + int reflect; + float peak_time; + float zero_wait; + float half_height; +} HexWaveParameters; + +struct HexWave +{ + float t, prev_dt; + HexWaveParameters current, pending; + int have_pending; + float buffer[STB_HEXWAVE_MAX_BLEP_LENGTH]; +}; +#endif + +#ifdef STB_HEXWAVE_IMPLEMENTATION + +#ifndef STB_HEXWAVE_NO_ALLOCATION +#include // malloc,free +#endif + +#include // memset,memcpy,memmove +#include // sin,cos,fabs + +#define hexwave_clamp(v,a,b) ((v) < (a) ? (a) : (v) > (b) ? (b) : (v)) + +STB_HEXWAVE_DEF void hexwave_change(HexWave *hex, int reflect, float peak_time, float half_height, float zero_wait) +{ + hex->pending.reflect = reflect; + hex->pending.peak_time = hexwave_clamp(peak_time,0,1); + hex->pending.half_height = half_height; + hex->pending.zero_wait = hexwave_clamp(zero_wait,0,1); + // put a barrier here to allow changing from a different thread than the generator + hex->have_pending = 1; +} + +STB_HEXWAVE_DEF void hexwave_create(HexWave *hex, int reflect, float peak_time, float half_height, float zero_wait) +{ + memset(hex, 0, sizeof(*hex)); + hexwave_change(hex, reflect, peak_time, half_height, zero_wait); + hex->current = hex->pending; + hex->have_pending = 0; + hex->t = 0; + hex->prev_dt = 0; +} + +static struct +{ + int width; // width of fixup in samples + int oversample; // number of oversampled versions (there's actually one more to allow lerpign) + float *blep; + float *blamp; +} hexblep; + +static void hex_add_oversampled_bleplike(float *output, float time_since_transition, float scale, float *data) +{ + float *d1,*d2; + float lerpweight; + int i, bw = hexblep.width; + + int slot = (int) (time_since_transition * hexblep.oversample); + if (slot >= hexblep.oversample) + slot = hexblep.oversample-1; // clamp in case the floats overshoot + + d1 = &data[ slot *bw]; + d2 = &data[(slot+1)*bw]; + + lerpweight = time_since_transition * hexblep.oversample - slot; + for (i=0; i < bw; ++i) + output[i] += scale * (d1[i] + (d2[i]-d1[i])*lerpweight); +} + +static void hex_blep (float *output, float time_since_transition, float scale) +{ + hex_add_oversampled_bleplike(output, time_since_transition, scale, hexblep.blep); +} + +static void hex_blamp(float *output, float time_since_transition, float scale) +{ + hex_add_oversampled_bleplike(output, time_since_transition, scale, hexblep.blamp); +} + +typedef struct +{ + float t,v,s; // time, value, slope +} hexvert; + +// each half of the waveform needs 4 vertices to represent 3 line +// segments, plus 1 more for wraparound +static void hexwave_generate_linesegs(hexvert vert[9], HexWave *hex, float dt) +{ + int j; + float min_len = dt / 256.0f; + + vert[0].t = 0; + vert[0].v = 0; + vert[1].t = hex->current.zero_wait*0.5f; + vert[1].v = 0; + vert[2].t = 0.5f*hex->current.peak_time + vert[1].t*(1-hex->current.peak_time); + vert[2].v = 1; + vert[3].t = 0.5f; + vert[3].v = hex->current.half_height; + + if (hex->current.reflect) { + for (j=4; j <= 7; ++j) { + vert[j].t = 1 - vert[7-j].t; + vert[j].v = - vert[7-j].v; + } + } else { + for (j=4; j <= 7; ++j) { + vert[j].t = 0.5f + vert[j-4].t; + vert[j].v = - vert[j-4].v; + } + } + vert[8].t = 1; + vert[8].v = 0; + + for (j=0; j < 8; ++j) { + if (vert[j+1].t <= vert[j].t + min_len) { + // if change takes place over less than a fraction of a sample treat as discontinuity + // + // otherwise the slope computation can blow up to arbitrarily large and we + // try to generate a huge BLAMP and the result is wrong. + // + // why does this happen if the math is right? i believe if done perfectly, + // the two BLAMPs on either side of the slope would cancel out, but our + // BLAMPs have only limited sub-sample precision and limited integration + // accuracy. or maybe it's just the math blowing up w/ floating point precision + // limits as we try to make x * (1/x) cancel out + // + // min_len verified artifact-free even near nyquist with only oversample=4 + vert[j+1].t = vert[j].t; + } + } + + if (vert[8].t != 1.0f) { + // if the above fixup moved the endpoint away from 1.0, move it back, + // along with any other vertices that got moved to the same time + float t = vert[8].t; + for (j=5; j <= 8; ++j) + if (vert[j].t == t) + vert[j].t = 1.0f; + } + + // compute the exact slopes from the final fixed-up positions + for (j=0; j < 8; ++j) + if (vert[j+1].t == vert[j].t) + vert[j].s = 0; + else + vert[j].s = (vert[j+1].v - vert[j].v) / (vert[j+1].t - vert[j].t); + + // wraparound at end + vert[8].t = 1; + vert[8].v = vert[0].v; + vert[8].s = vert[0].s; +} + +STB_HEXWAVE_DEF void hexwave_generate_samples(float *output, int num_samples, HexWave *hex, float freq) +{ + hexvert vert[9]; + int pass,i,j; + float t = hex->t; + float temp_output[2*STB_HEXWAVE_MAX_BLEP_LENGTH]; + int buffered_length = sizeof(float)*hexblep.width; + float dt = (float) fabs(freq); + float recip_dt = (dt == 0.0f) ? 0.0f : 1.0f / dt; + + int halfw = hexblep.width/2; + // all sample times are biased by halfw to leave room for BLEP/BLAMP to go back in time + + if (num_samples <= 0) + return; + + // convert parameters to times and slopes + hexwave_generate_linesegs(vert, hex, dt); + + if (hex->prev_dt != dt) { + // if frequency changes, add a fixup at the derivative discontinuity starting at now + float slope; + for (j=1; j < 6; ++j) + if (t < vert[j].t) + break; + slope = vert[j].s; + if (slope != 0) + hex_blamp(output, 0, (dt - hex->prev_dt)*slope); + hex->prev_dt = dt; + } + + // copy the buffered data from last call and clear the rest of the output array + memset(output, 0, sizeof(float)*num_samples); + memset(temp_output, 0, 2*hexblep.width*sizeof(float)); + + if (num_samples >= hexblep.width) { + memcpy(output, hex->buffer, buffered_length); + } else { + // if the output is shorter than hexblep.width, we do all synthesis to temp_output + memcpy(temp_output, hex->buffer, buffered_length); + } + + for (pass=0; pass < 2; ++pass) { + int i0,i1; + float *out; + + // we want to simulate having one buffer that is num_output + hexblep.width + // samples long, without putting that requirement on the user, and without + // allocating a temp buffer that's as long as the whole thing. so we use two + // overlapping buffers, one the user's buffer and one a fixed-length temp + // buffer. + + if (pass == 0) { + if (num_samples < hexblep.width) + continue; + // run as far as we can without overwriting the end of the user's buffer + out = output; + i0 = 0; + i1 = num_samples - hexblep.width; + } else { + // generate the rest into a temp buffer + out = temp_output; + i0 = 0; + if (num_samples >= hexblep.width) + i1 = hexblep.width; + else + i1 = num_samples; + } + + // determine current segment + for (j=0; j < 8; ++j) + if (t < vert[j+1].t) + break; + + i = i0; + for(;;) { + while (t < vert[j+1].t) { + if (i == i1) + goto done; + out[i+halfw] += vert[j].v + vert[j].s*(t - vert[j].t); + t += dt; + ++i; + } + // transition from lineseg starting at j to lineseg starting at j+1 + + if (vert[j].t == vert[j+1].t) + hex_blep(out+i, recip_dt*(t-vert[j+1].t), (vert[j+1].v - vert[j].v)); + hex_blamp(out+i, recip_dt*(t-vert[j+1].t), dt*(vert[j+1].s - vert[j].s)); + ++j; + + if (j == 8) { + // change to different waveform if there's a change pending + j = 0; + t -= 1.0; // t was >= 1.f if j==8 + if (hex->have_pending) { + float prev_s0 = vert[j].s; + float prev_v0 = vert[j].v; + hex->current = hex->pending; + hex->have_pending = 0; + hexwave_generate_linesegs(vert, hex, dt); + // the following never occurs with this oscillator, but it makes + // the code work in more general cases + if (vert[j].v != prev_v0) + hex_blep (out+i, recip_dt*t, (vert[j].v - prev_v0)); + if (vert[j].s != prev_s0) + hex_blamp(out+i, recip_dt*t, dt*(vert[j].s - prev_s0)); + } + } + } + done: + ; + } + + // at this point, we've written output[] and temp_output[] + if (num_samples >= hexblep.width) { + // the first half of temp[] overlaps the end of output, the second half will be the new start overlap + for (i=0; i < hexblep.width; ++i) + output[num_samples-hexblep.width + i] += temp_output[i]; + memcpy(hex->buffer, temp_output+hexblep.width, buffered_length); + } else { + for (i=0; i < num_samples; ++i) + output[i] = temp_output[i]; + memcpy(hex->buffer, temp_output+num_samples, buffered_length); + } + + hex->t = t; +} + +STB_HEXWAVE_DEF void hexwave_shutdown(float *user_buffer) +{ + #ifndef STB_HEXWAVE_NO_ALLOCATION + if (user_buffer != 0) { + free(hexblep.blep); + free(hexblep.blamp); + } + #endif +} + +// buffer should be NULL or must be 4*(width*(oversample+1)*2 + +STB_HEXWAVE_DEF void hexwave_init(int width, int oversample, float *user_buffer) +{ + int halfwidth = width/2; + int half = halfwidth*oversample; + int blep_buffer_count = width*(oversample+1); + int n = 2*half+1; +#ifdef STB_HEXWAVE_NO_ALLOCATION + float *buffers = user_buffer; +#else + float *buffers = user_buffer ? user_buffer : (float *) malloc(sizeof(float) * n * 2); +#endif + float *step = buffers+0*n; + float *ramp = buffers+1*n; + float *blep_buffer, *blamp_buffer; + double integrate_impulse=0, integrate_step=0; + int i,j; + + if (width > STB_HEXWAVE_MAX_BLEP_LENGTH) + width = STB_HEXWAVE_MAX_BLEP_LENGTH; + + if (user_buffer == 0) { + #ifndef STB_HEXWAVE_NO_ALLOCATION + blep_buffer = (float *) malloc(sizeof(float)*blep_buffer_count); + blamp_buffer = (float *) malloc(sizeof(float)*blep_buffer_count); + #endif + } else { + blep_buffer = ramp+n; + blamp_buffer = blep_buffer + blep_buffer_count; + } + + // compute BLEP and BLAMP by integerating windowed sinc + for (i=0; i < n; ++i) { + for (j=0; j < 16; ++j) { + float sinc_t = 3.141592f* (i-half) / oversample; + float sinc = (i==half) ? 1.0f : (float) sin(sinc_t) / (sinc_t); + float wt = 2.0f*3.1415926f * i / (n-1); + float window = (float) (0.355768 - 0.487396*cos(wt) + 0.144232*cos(2*wt) - 0.012604*cos(3*wt)); // Nuttall + double value = window * sinc; + integrate_impulse += value/16; + integrate_step += integrate_impulse/16; + } + step[i] = (float) integrate_impulse; + ramp[i] = (float) integrate_step; + } + + // renormalize + for (i=0; i < n; ++i) { + step[i] = step[i] * (float) (1.0 / step[n-1]); // step needs to reach to 1.0 + ramp[i] = ramp[i] * (float) (halfwidth / ramp[n-1]); // ramp needs to become a slope of 1.0 after oversampling + } + + // deinterleave to allow efficient interpolation e.g. w/SIMD + for (j=0; j <= oversample; ++j) { + for (i=0; i < width; ++i) { + blep_buffer [j*width+i] = step[j+i*oversample]; + blamp_buffer[j*width+i] = ramp[j+i*oversample]; + } + } + + // subtract out the naive waveform; note we can't do this to the raw data + // above, because we want the discontinuity to be in a different locations + // for j=0 and j=oversample (which exists to provide something to interpolate against) + for (j=0; j <= oversample; ++j) { + // subtract step + for (i=halfwidth; i < width; ++i) + blep_buffer [j*width+i] -= 1.0f; + // subtract ramp + for (i=halfwidth; i < width; ++i) + blamp_buffer[j*width+i] -= (j+i*oversample-half)*(1.0f/oversample); + } + + hexblep.blep = blep_buffer; + hexblep.blamp = blamp_buffer; + hexblep.width = width; + hexblep.oversample = oversample; + + #ifndef STB_HEXWAVE_NO_ALLOCATION + if (user_buffer == 0) + free(buffers); + #endif +} +#endif // STB_HEXWAVE_IMPLEMENTATION + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/third_party/stb/stb_image.h b/third_party/stb/stb_image.h index accef48..9eedabe 100644 --- a/third_party/stb/stb_image.h +++ b/third_party/stb/stb_image.h @@ -1,4 +1,4 @@ -/* stb_image - v2.26 - public domain image loader - http://nothings.org/stb +/* stb_image - v2.30 - public domain image loader - http://nothings.org/stb no warranty implied; use at your own risk Do this: @@ -48,6 +48,10 @@ LICENSE RECENT REVISION HISTORY: + 2.30 (2024-05-31) avoid erroneous gcc warning + 2.29 (2023-05-xx) optimizations + 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes 2.26 (2020-07-13) many minor fixes 2.25 (2020-02-02) fix warnings 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically @@ -89,7 +93,7 @@ RECENT REVISION HISTORY: Jeremy Sawicki (handle all ImageNet JPGs) Optimizations & bugfixes Mikhail Morozov (1-bit BMP) Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) - Arseny Kapoulkine + Arseny Kapoulkine Simon Breuss (16-bit PNM) John-Mark Allen Carmelo J Fdez-Aguera @@ -102,19 +106,21 @@ RECENT REVISION HISTORY: Thomas Ruf Ronny Chevalier github:rlyeh Janez Zemva John Bartholomew Michal Cichon github:romigrou Jonathan Blow Ken Hamada Tero Hanninen github:svdijk - Laurent Gomila Cort Stratton github:snagar + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex Cass Everitt Ryamond Barbiero github:grim210 Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus - Josh Tobin Matthew Gregan github:poppolopoppo + Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo Julian Raschke Gregory Mullen Christian Floisand github:darealshinji Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 - Brad Weinberger Matvey Cherevko [reserved] + Brad Weinberger Matvey Cherevko github:mosra Luca Sas Alexander Veselov Zack Middleton [reserved] Ryan C. Gordon [reserved] [reserved] DO NOT ADD YOUR NAME HERE + Jacko Dirks + To add your name to the credits, pick a random blank space in the middle and fill it. 80% of merge conflicts on stb PRs are due to people adding their name at the end of the credits. @@ -137,7 +143,7 @@ RECENT REVISION HISTORY: // // ... x = width, y = height, n = # 8-bit components per pixel ... // // ... replace '0' with '1'..'4' to force that many components per pixel // // ... but 'n' will always be the number that it would have been if you said 0 -// stbi_image_free(data) +// stbi_image_free(data); // // Standard parameters: // int *x -- outputs image width in pixels @@ -176,6 +182,32 @@ RECENT REVISION HISTORY: // // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. // +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. +// // =========================================================================== // // UNICODE: @@ -281,11 +313,10 @@ RECENT REVISION HISTORY: // // iPhone PNG support: // -// By default we convert iphone-formatted PNGs back to RGB, even though -// they are internally encoded differently. You can disable this conversion -// by calling stbi_convert_iphone_png_to_rgb(0), in which case -// you will always just get the native iphone "format" through (which -// is BGR stored in RGB). +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). // // Call stbi_set_unpremultiply_on_load(1) as well to force a divide per // pixel to remove any premultiplied alpha *only* if the image file explicitly @@ -489,6 +520,8 @@ STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); // as above, but only applies to images loaded on the thread that calls the function // this function is only available if your compiler supports thread-local variables; // calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); // ZLIB client - used by PNG, available for other purposes @@ -605,7 +638,7 @@ STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch #endif #endif -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(__SYMBIAN32__) typedef unsigned short stbi__uint16; typedef signed short stbi__int16; typedef unsigned int stbi__uint32; @@ -634,7 +667,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1]; #ifdef STBI_HAS_LROTL #define stbi_lrot(x,y) _lrotl(x,y) #else - #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (32 - (y)))) + #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (-(y) & 31))) #endif #if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED)) @@ -748,9 +781,12 @@ static int stbi__sse2_available(void) #ifdef STBI_NEON #include -// assume GCC or Clang on ARM targets +#ifdef _MSC_VER +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name +#else #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) #endif +#endif #ifndef STBI_SIMD_ALIGN #define STBI_SIMD_ALIGN(type, name) type name @@ -924,6 +960,7 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); static int stbi__pnm_test(stbi__context *s); static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__pnm_is16(stbi__context *s); #endif static @@ -998,7 +1035,7 @@ static int stbi__mad3sizes_valid(int a, int b, int c, int add) } // returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow -#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) { return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && @@ -1021,7 +1058,7 @@ static void *stbi__malloc_mad3(int a, int b, int c, int add) return stbi__malloc(a*b*c + add); } -#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) { if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; @@ -1029,6 +1066,23 @@ static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) } #endif +// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow. +static int stbi__addints_valid(int a, int b) +{ + if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow + if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. + return a <= INT_MAX - b; +} + +// returns 1 if the product of two ints fits in a signed short, 0 on overflow. +static int stbi__mul2shorts_valid(int a, int b) +{ + if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow + if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid + if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN + return a >= SHRT_MIN / b; +} + // stbi__err - error // stbi__errpf - error returning pointer to float // stbi__errpuc - error returning pointer to unsigned char @@ -1087,9 +1141,8 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order ri->num_channels = 0; - #ifndef STBI_NO_JPEG - if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); - #endif + // test the formats with a very explicit header first (at least a FOURCC + // or distinctive magic number first) #ifndef STBI_NO_PNG if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); #endif @@ -1107,6 +1160,13 @@ static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int re #ifndef STBI_NO_PIC if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); #endif + + // then the formats that can end up attempting to load with just 1 or 2 + // bytes matching expectations; these are prone to false positives, so + // try them later + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); + #endif #ifndef STBI_NO_PNM if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); #endif @@ -1262,12 +1322,12 @@ static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, in #ifndef STBI_NO_STDIO -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); #endif -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) { return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); @@ -1277,16 +1337,16 @@ STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wch static FILE *stbi__fopen(char const *filename, char const *mode) { FILE *f; -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) wchar_t wMode[64]; wchar_t wFilename[1024]; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename))) + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) return 0; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode))) + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) return 0; -#if _MSC_VER >= 1400 +#if defined(_MSC_VER) && _MSC_VER >= 1400 if (0 != _wfopen_s(&f, wFilename, wMode)) f = 0; #else @@ -1662,7 +1722,8 @@ static int stbi__get16le(stbi__context *s) static stbi__uint32 stbi__get32le(stbi__context *s) { stbi__uint32 z = stbi__get16le(s); - return z + (stbi__get16le(s) << 16); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; } #endif @@ -1944,9 +2005,12 @@ static int stbi__build_huffman(stbi__huffman *h, int *count) int i,j,k=0; unsigned int code; // build size list for each symbol (from JPEG spec) - for (i=0; i < 16; ++i) - for (j=0; j < count[i]; ++j) + for (i=0; i < 16; ++i) { + for (j=0; j < count[i]; ++j) { h->size[k++] = (stbi_uc) (i+1); + if(k >= 257) return stbi__err("bad size list","Corrupt JPEG"); + } + } h->size[k] = 0; // compute actual symbols (from jpeg spec) @@ -2071,6 +2135,8 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) // convert the huffman code to the symbol id c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + if(c < 0 || c >= 256) // symbol id out of bounds! + return -1; STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]); // convert the id to a symbol @@ -2089,14 +2155,14 @@ stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n) unsigned int k; int sgn; if (j->code_bits < n) stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing - sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB + sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative) k = stbi_lrot(j->code_buffer, n); - if (n < 0 || n >= (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask))) return 0; j->code_buffer = k & ~stbi__bmask[n]; k &= stbi__bmask[n]; j->code_bits -= n; - return k + (stbi__jbias[n] & ~sgn); + return k + (stbi__jbias[n] & (sgn - 1)); } // get some unsigned bits @@ -2104,6 +2170,7 @@ stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n) { unsigned int k; if (j->code_bits < n) stbi__grow_buffer_unsafe(j); + if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s intead of continuing k = stbi_lrot(j->code_buffer, n); j->code_buffer = k & ~stbi__bmask[n]; k &= stbi__bmask[n]; @@ -2115,6 +2182,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j) { unsigned int k; if (j->code_bits < 1) stbi__grow_buffer_unsafe(j); + if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s intead of continuing k = j->code_buffer; j->code_buffer <<= 1; --j->code_bits; @@ -2146,14 +2214,16 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); t = stbi__jpeg_huff_decode(j, hdc); - if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG"); // 0 all the ac values now so we can do it 32-bits at a time memset(data,0,64*sizeof(data[0])); diff = t ? stbi__extend_receive(j, t) : 0; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG"); dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); data[0] = (short) (dc * dequant[0]); // decode AC components, see JPEG spec @@ -2167,6 +2237,7 @@ static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman if (r) { // fast-AC path k += (r >> 4) & 15; // run s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); j->code_buffer <<= s; j->code_bits -= s; // decode into unzigzag'd location @@ -2203,12 +2274,14 @@ static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__ // first scan for DC coefficient, must be first memset(data,0,64*sizeof(data[0])); // 0 all the ac values now t = stbi__jpeg_huff_decode(j, hdc); - if (t == -1) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); diff = t ? stbi__extend_receive(j, t) : 0; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG"); dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; - data[0] = (short) (dc << j->succ_low); + if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * (1 << j->succ_low)); } else { // refinement scan for DC coefficient if (stbi__jpeg_get_bit(j)) @@ -2242,10 +2315,11 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__ if (r) { // fast-AC path k += (r >> 4) & 15; // run s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); j->code_buffer <<= s; j->code_bits -= s; zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short) ((r >> 8) << shift); + data[zig] = (short) ((r >> 8) * (1 << shift)); } else { int rs = stbi__jpeg_huff_decode(j, hac); if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); @@ -2263,7 +2337,7 @@ static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__ } else { k += r; zig = stbi__jpeg_dezigzag[k++]; - data[zig] = (short) (stbi__extend_receive(j,s) << shift); + data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); } } } while (k <= j->spec_end); @@ -3062,6 +3136,7 @@ static int stbi__process_marker(stbi__jpeg *z, int m) sizes[i] = stbi__get8(z->s); n += sizes[i]; } + if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values! L -= 17; if (tc == 0) { if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; @@ -3227,6 +3302,13 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan) if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; } + // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i=0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG"); + } + // compute interleaved mcu info z->img_h_max = h_max; z->img_v_max = v_max; @@ -3304,6 +3386,28 @@ static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) return 1; } +static stbi_uc stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) +{ + // some JPEGs have junk at end, skip over it but if we find what looks + // like a valid marker, resume there + while (!stbi__at_eof(j->s)) { + stbi_uc x = stbi__get8(j->s); + while (x == 0xff) { // might be a marker + if (stbi__at_eof(j->s)) return STBI__MARKER_none; + x = stbi__get8(j->s); + if (x != 0x00 && x != 0xff) { + // not a stuffed zero or lead-in to another marker, looks + // like an actual marker, return it + return x; + } + // stuffed zero has x=0 now which ends the loop, meaning we go + // back to regular scan loop. + // repeated 0xff keeps trying to read the next byte of the marker. + } + } + return STBI__MARKER_none; +} + // decode image to YCbCr format static int stbi__decode_jpeg_image(stbi__jpeg *j) { @@ -3320,25 +3424,22 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j) if (!stbi__process_scan_header(j)) return 0; if (!stbi__parse_entropy_coded_data(j)) return 0; if (j->marker == STBI__MARKER_none ) { - // handle 0s at the end of image data from IP Kamera 9060 - while (!stbi__at_eof(j->s)) { - int x = stbi__get8(j->s); - if (x == 255) { - j->marker = stbi__get8(j->s); - break; - } - } + j->marker = stbi__skip_jpeg_junk_at_end(j); // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 } + m = stbi__get_marker(j); + if (STBI__RESTART(m)) + m = stbi__get_marker(j); } else if (stbi__DNL(m)) { int Ld = stbi__get16be(j->s); stbi__uint32 NL = stbi__get16be(j->s); if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); + m = stbi__get_marker(j); } else { - if (!stbi__process_marker(j, m)) return 0; + if (!stbi__process_marker(j, m)) return 1; + m = stbi__get_marker(j); } - m = stbi__get_marker(j); } if (j->progressive) stbi__jpeg_finish(j); @@ -3782,6 +3883,10 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp else decode_n = z->s->img_n; + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; } + // resample and color-convert { int k; @@ -3924,6 +4029,8 @@ static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int re { unsigned char* result; stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__errpuc("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); STBI_NOTUSED(ri); j->s = s; stbi__setup_jpeg(j); @@ -3936,6 +4043,8 @@ static int stbi__jpeg_test(stbi__context *s) { int r; stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); j->s = s; stbi__setup_jpeg(j); r = stbi__decode_jpeg_header(j, STBI__SCAN_type); @@ -3960,6 +4069,8 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) { int result; stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); j->s = s; result = stbi__jpeg_info_raw(j, x, y, comp); STBI_FREE(j); @@ -3979,6 +4090,7 @@ static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) // fast-way is faster to check than jpeg huffman, but slow way is slower #define STBI__ZFAST_BITS 9 // accelerate all cases in default tables #define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet // zlib-style huffman encoding // (jpegs packs from left, zlib from right, so can't share code) @@ -3988,8 +4100,8 @@ typedef struct stbi__uint16 firstcode[16]; int maxcode[17]; stbi__uint16 firstsymbol[16]; - stbi_uc size[288]; - stbi__uint16 value[288]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; } stbi__zhuffman; stbi_inline static int stbi__bitreverse16(int n) @@ -4066,6 +4178,7 @@ typedef struct { stbi_uc *zbuffer, *zbuffer_end; int num_bits; + int hit_zeof_once; stbi__uint32 code_buffer; char *zout; @@ -4120,7 +4233,7 @@ static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) if (s >= 16) return -1; // invalid code! // code size is s, so: b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; - if (b >= sizeof (z->size)) return -1; // some data was corrupt somewhere! + if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. a->code_buffer >>= s; a->num_bits -= s; @@ -4132,9 +4245,20 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) int b,s; if (a->num_bits < 16) { if (stbi__zeof(a)) { - return -1; /* report error for unexpected end of data. */ + if (!a->hit_zeof_once) { + // This is the first time we hit eof, insert 16 extra padding btis + // to allow us to keep going; if we actually consume any of them + // though, that is invalid data. This is caught later. + a->hit_zeof_once = 1; + a->num_bits += 16; // add 16 implicit zero bits + } else { + // We already inserted our extra 16 padding bits and are again + // out, this stream is actually prematurely terminated. + return -1; + } + } else { + stbi__fill_bits(a); } - stbi__fill_bits(a); } b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; if (b) { @@ -4199,17 +4323,25 @@ static int stbi__parse_huffman_block(stbi__zbuf *a) int len,dist; if (z == 256) { a->zout = zout; + if (a->hit_zeof_once && a->num_bits < 16) { + // The first time we hit zeof, we inserted 16 extra zero bits into our bit + // buffer so the decoder can just do its speculative decoding. But if we + // actually consumed any of those bits (which is the case when num_bits < 16), + // the stream actually read past the end so it is malformed. + return stbi__err("unexpected end","Corrupt PNG"); + } return 1; } + if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data z -= 257; len = stbi__zlength_base[z]; if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); z = stbi__zhuffman_decode(a, &a->z_distance); - if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); + if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data dist = stbi__zdist_base[z]; if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); - if (zout + len > a->zout_end) { + if (len > a->zout_end - zout) { if (!stbi__zexpand(a, zout, len)) return 0; zout = a->zout; } @@ -4317,7 +4449,7 @@ static int stbi__parse_zlib_header(stbi__zbuf *a) return 1; } -static const stbi_uc stbi__zdefault_length[288] = +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = { 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, @@ -4353,6 +4485,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) if (!stbi__parse_zlib_header(a)) return 0; a->num_bits = 0; a->code_buffer = 0; + a->hit_zeof_once = 0; do { final = stbi__zreceive(a,1); type = stbi__zreceive(a,2); @@ -4363,7 +4496,7 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) } else { if (type == 1) { // use fixed code lengths - if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , 288)) return 0; + if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; } else { if (!stbi__compute_huffman_codes(a)) return 0; @@ -4508,9 +4641,8 @@ enum { STBI__F_up=2, STBI__F_avg=3, STBI__F_paeth=4, - // synthetic filters used for first scanline to avoid needing a dummy row of 0s - STBI__F_avg_first, - STBI__F_paeth_first + // synthetic filter used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first }; static stbi_uc first_row_filter[5] = @@ -4519,29 +4651,56 @@ static stbi_uc first_row_filter[5] = STBI__F_sub, STBI__F_none, STBI__F_avg_first, - STBI__F_paeth_first + STBI__F_sub // Paeth with b=c=0 turns out to be equivalent to sub }; static int stbi__paeth(int a, int b, int c) { - int p = a + b - c; - int pa = abs(p-a); - int pb = abs(p-b); - int pc = abs(p-c); - if (pa <= pb && pa <= pc) return a; - if (pb <= pc) return b; - return c; + // This formulation looks very different from the reference in the PNG spec, but is + // actually equivalent and has favorable data dependencies and admits straightforward + // generation of branch-free code, which helps performance significantly. + int thresh = c*3 - (a + b); + int lo = a < b ? a : b; + int hi = a < b ? b : a; + int t0 = (hi <= thresh) ? lo : c; + int t1 = (thresh <= lo) ? hi : t0; + return t1; } static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; +// adds an extra all-255 alpha channel +// dest == src is legal +// img_n must be 1 or 3 +static void stbi__create_png_alpha_expand8(stbi_uc *dest, stbi_uc *src, stbi__uint32 x, int img_n) +{ + int i; + // must process data backwards since we allow dest==src + if (img_n == 1) { + for (i=x-1; i >= 0; --i) { + dest[i*2+1] = 255; + dest[i*2+0] = src[i]; + } + } else { + STBI_ASSERT(img_n == 3); + for (i=x-1; i >= 0; --i) { + dest[i*4+3] = 255; + dest[i*4+2] = src[i*3+2]; + dest[i*4+1] = src[i*3+1]; + dest[i*4+0] = src[i*3+0]; + } + } +} + // create the png data from post-deflated data static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) { - int bytes = (depth == 16? 2 : 1); + int bytes = (depth == 16 ? 2 : 1); stbi__context *s = a->s; stbi__uint32 i,j,stride = x*out_n*bytes; stbi__uint32 img_len, img_width_bytes; + stbi_uc *filter_buf; + int all_ok = 1; int k; int img_n = s->img_n; // copy it into a local for later @@ -4553,8 +4712,11 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into if (!a->out) return stbi__err("outofmem", "Out of memory"); + // note: error exits here don't need to clean up a->out individually, + // stbi__do_png always does on error. if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); img_width_bytes = (((img_n * x * depth) + 7) >> 3); + if (!stbi__mad2sizes_valid(img_width_bytes, y, img_width_bytes)) return stbi__err("too large", "Corrupt PNG"); img_len = (img_width_bytes + 1) * y; // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, @@ -4562,189 +4724,137 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r // so just check for raw_len < img_len always. if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + // Allocate two scan lines worth of filter workspace buffer. + filter_buf = (stbi_uc *) stbi__malloc_mad2(img_width_bytes, 2, 0); + if (!filter_buf) return stbi__err("outofmem", "Out of memory"); + + // Filtering for low-bit-depth images + if (depth < 8) { + filter_bytes = 1; + width = img_width_bytes; + } + for (j=0; j < y; ++j) { - stbi_uc *cur = a->out + stride*j; - stbi_uc *prior; + // cur/prior filter buffers alternate + stbi_uc *cur = filter_buf + (j & 1)*img_width_bytes; + stbi_uc *prior = filter_buf + (~j & 1)*img_width_bytes; + stbi_uc *dest = a->out + stride*j; + int nk = width * filter_bytes; int filter = *raw++; - if (filter > 4) - return stbi__err("invalid filter","Corrupt PNG"); - - if (depth < 8) { - if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG"); - cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place - filter_bytes = 1; - width = img_width_bytes; + // check filter type + if (filter > 4) { + all_ok = stbi__err("invalid filter","Corrupt PNG"); + break; } - prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above // if first row, use special filter that doesn't sample previous row if (j == 0) filter = first_row_filter[filter]; - // handle first byte explicitly - for (k=0; k < filter_bytes; ++k) { - switch (filter) { - case STBI__F_none : cur[k] = raw[k]; break; - case STBI__F_sub : cur[k] = raw[k]; break; - case STBI__F_up : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; - case STBI__F_avg : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break; - case STBI__F_paeth : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break; - case STBI__F_avg_first : cur[k] = raw[k]; break; - case STBI__F_paeth_first: cur[k] = raw[k]; break; - } - } - - if (depth == 8) { - if (img_n != out_n) - cur[img_n] = 255; // first pixel - raw += img_n; - cur += out_n; - prior += out_n; - } else if (depth == 16) { - if (img_n != out_n) { - cur[filter_bytes] = 255; // first pixel top byte - cur[filter_bytes+1] = 255; // first pixel bottom byte - } - raw += filter_bytes; - cur += output_bytes; - prior += output_bytes; - } else { - raw += 1; - cur += 1; - prior += 1; + // perform actual filtering + switch (filter) { + case STBI__F_none: + memcpy(cur, raw, nk); + break; + case STBI__F_sub: + memcpy(cur, raw, filter_bytes); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); + break; + case STBI__F_up: + for (k = 0; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); + break; + case STBI__F_avg: + for (k = 0; k < filter_bytes; ++k) + cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); + break; + case STBI__F_paeth: + for (k = 0; k < filter_bytes; ++k) + cur[k] = STBI__BYTECAST(raw[k] + prior[k]); // prior[k] == stbi__paeth(0,prior[k],0) + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes], prior[k], prior[k-filter_bytes])); + break; + case STBI__F_avg_first: + memcpy(cur, raw, filter_bytes); + for (k = filter_bytes; k < nk; ++k) + cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); + break; } - // this is a little gross, so that we don't switch per-pixel or per-component - if (depth < 8 || img_n == out_n) { - int nk = (width - 1)*filter_bytes; - #define STBI__CASE(f) \ - case f: \ - for (k=0; k < nk; ++k) - switch (filter) { - // "none" filter turns into a memcpy here; make that explicit. - case STBI__F_none: memcpy(cur, raw, nk); break; - STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break; - STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; - STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break; - STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break; - STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break; - STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break; - } - #undef STBI__CASE - raw += nk; - } else { - STBI_ASSERT(img_n+1 == out_n); - #define STBI__CASE(f) \ - case f: \ - for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ - for (k=0; k < filter_bytes; ++k) - switch (filter) { - STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } break; - STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break; - STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; - STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break; - STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break; - STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break; - STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break; - } - #undef STBI__CASE - - // the loop above sets the high byte of the pixels' alpha, but for - // 16 bit png files we also need the low byte set. we'll do that here. - if (depth == 16) { - cur = a->out + stride*j; // start at the beginning of the row again - for (i=0; i < x; ++i,cur+=output_bytes) { - cur[filter_bytes+1] = 255; - } - } - } - } + raw += nk; - // we make a separate pass to expand bits to pixels; for performance, - // this could run two scanlines behind the above code, so it won't - // intefere with filtering but will still be in the cache. - if (depth < 8) { - for (j=0; j < y; ++j) { - stbi_uc *cur = a->out + stride*j; - stbi_uc *in = a->out + stride*j + x*out_n - img_width_bytes; - // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit - // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + // expand decoded bits in cur to dest, also adding an extra alpha channel if desired + if (depth < 8) { stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + stbi_uc *in = cur; + stbi_uc *out = dest; + stbi_uc inb = 0; + stbi__uint32 nsmp = x*img_n; - // note that the final byte might overshoot and write more data than desired. - // we can allocate enough data that this never writes out of memory, but it - // could also overwrite the next scanline. can it overwrite non-empty data - // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. - // so we need to explicitly clamp the final ones - + // expand bits to bytes first if (depth == 4) { - for (k=x*img_n; k >= 2; k-=2, ++in) { - *cur++ = scale * ((*in >> 4) ); - *cur++ = scale * ((*in ) & 0x0f); + for (i=0; i < nsmp; ++i) { + if ((i & 1) == 0) inb = *in++; + *out++ = scale * (inb >> 4); + inb <<= 4; } - if (k > 0) *cur++ = scale * ((*in >> 4) ); } else if (depth == 2) { - for (k=x*img_n; k >= 4; k-=4, ++in) { - *cur++ = scale * ((*in >> 6) ); - *cur++ = scale * ((*in >> 4) & 0x03); - *cur++ = scale * ((*in >> 2) & 0x03); - *cur++ = scale * ((*in ) & 0x03); + for (i=0; i < nsmp; ++i) { + if ((i & 3) == 0) inb = *in++; + *out++ = scale * (inb >> 6); + inb <<= 2; } - if (k > 0) *cur++ = scale * ((*in >> 6) ); - if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); - if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); - } else if (depth == 1) { - for (k=x*img_n; k >= 8; k-=8, ++in) { - *cur++ = scale * ((*in >> 7) ); - *cur++ = scale * ((*in >> 6) & 0x01); - *cur++ = scale * ((*in >> 5) & 0x01); - *cur++ = scale * ((*in >> 4) & 0x01); - *cur++ = scale * ((*in >> 3) & 0x01); - *cur++ = scale * ((*in >> 2) & 0x01); - *cur++ = scale * ((*in >> 1) & 0x01); - *cur++ = scale * ((*in ) & 0x01); + } else { + STBI_ASSERT(depth == 1); + for (i=0; i < nsmp; ++i) { + if ((i & 7) == 0) inb = *in++; + *out++ = scale * (inb >> 7); + inb <<= 1; } - if (k > 0) *cur++ = scale * ((*in >> 7) ); - if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); - if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); - if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); - if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); - if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); - if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); } - if (img_n != out_n) { - int q; - // insert alpha = 255 - cur = a->out + stride*j; + + // insert alpha=255 values if desired + if (img_n != out_n) + stbi__create_png_alpha_expand8(dest, dest, x, img_n); + } else if (depth == 8) { + if (img_n == out_n) + memcpy(dest, cur, x*img_n); + else + stbi__create_png_alpha_expand8(dest, cur, x, img_n); + } else if (depth == 16) { + // convert the image data from big-endian to platform-native + stbi__uint16 *dest16 = (stbi__uint16*)dest; + stbi__uint32 nsmp = x*img_n; + + if (img_n == out_n) { + for (i = 0; i < nsmp; ++i, ++dest16, cur += 2) + *dest16 = (cur[0] << 8) | cur[1]; + } else { + STBI_ASSERT(img_n+1 == out_n); if (img_n == 1) { - for (q=x-1; q >= 0; --q) { - cur[q*2+1] = 255; - cur[q*2+0] = cur[q]; + for (i = 0; i < x; ++i, dest16 += 2, cur += 2) { + dest16[0] = (cur[0] << 8) | cur[1]; + dest16[1] = 0xffff; } } else { STBI_ASSERT(img_n == 3); - for (q=x-1; q >= 0; --q) { - cur[q*4+3] = 255; - cur[q*4+2] = cur[q*3+2]; - cur[q*4+1] = cur[q*3+1]; - cur[q*4+0] = cur[q*3+0]; + for (i = 0; i < x; ++i, dest16 += 4, cur += 6) { + dest16[0] = (cur[0] << 8) | cur[1]; + dest16[1] = (cur[2] << 8) | cur[3]; + dest16[2] = (cur[4] << 8) | cur[5]; + dest16[3] = 0xffff; } } } } - } else if (depth == 16) { - // force the image data from big-endian to platform-native. - // this is done in a separate pass due to the decoding relying - // on the data being untouched, but could probably be done - // per-line during decode if care is taken. - stbi_uc *cur = a->out; - stbi__uint16 *cur16 = (stbi__uint16*)cur; - - for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) { - *cur16 = (cur[0] << 8) | cur[1]; - } } + STBI_FREE(filter_buf); + if (!all_ok) return 0; + return 1; } @@ -4759,6 +4869,7 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3 // de-interlacing final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) return stbi__err("outofmem", "Out of memory"); for (p=0; p < 7; ++p) { int xorig[] = { 0,4,0,2,0,1,0 }; int yorig[] = { 0,0,4,0,2,0,1 }; @@ -4879,19 +4990,46 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int return 1; } -static int stbi__unpremultiply_on_load = 0; -static int stbi__de_iphone_flag = 0; +static int stbi__unpremultiply_on_load_global = 0; +static int stbi__de_iphone_flag_global = 0; STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) { - stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; } STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) { - stbi__de_iphone_flag = flag_true_if_should_convert; + stbi__de_iphone_flag_global = flag_true_if_should_convert; } +#ifndef STBI_THREAD_LOCAL +#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global +#define stbi__de_iphone_flag stbi__de_iphone_flag_global +#else +static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set; +static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set; + +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_set = 1; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_local = flag_true_if_should_convert; + stbi__de_iphone_flag_set = 1; +} + +#define stbi__unpremultiply_on_load (stbi__unpremultiply_on_load_set \ + ? stbi__unpremultiply_on_load_local \ + : stbi__unpremultiply_on_load_global) +#define stbi__de_iphone_flag (stbi__de_iphone_flag_set \ + ? stbi__de_iphone_flag_local \ + : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + static void stbi__de_iphone(stbi__png *z) { stbi__context *s = z->s; @@ -4981,14 +5119,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) if (!pal_img_n) { s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); - if (scan == STBI__SCAN_header) return 1; } else { // if paletted, then pal_n is our final components, and // img_n is # components to decompress/filter. s->img_n = 1; if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); - // if SCAN_header, have to scan to see if we have a tRNS } + // even with SCAN_header, have to scan to see if we have a tRNS break; } @@ -5020,10 +5157,14 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); has_trans = 1; + // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. + if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } if (z->depth == 16) { - for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + for (k = 0; k < s->img_n && k < 3; ++k) // extra loop test to suppress false GCC warning + tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is } else { - for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + for (k = 0; k < s->img_n && k < 3; ++k) + tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger } } break; @@ -5032,7 +5173,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) case STBI__PNG_TYPE('I','D','A','T'): { if (first) return stbi__err("first not IHDR", "Corrupt PNG"); if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG"); - if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; } + if (scan == STBI__SCAN_header) { + // header scan definitely stops at first IDAT + if (pal_img_n) + s->img_n = pal_img_n; + return 1; + } + if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); if ((int)(ioff + c.length) < (int)ioff) return 0; if (ioff + c.length > idata_limit) { stbi__uint32 idata_limit_old = idata_limit; @@ -5272,6 +5419,32 @@ typedef struct int extra_read; } stbi__bmp_data; +static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) +{ + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) { int hsz; @@ -5299,6 +5472,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) if (hsz != 12) { int compress = stbi__get32le(s); if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel stbi__get32le(s); // discard sizeof stbi__get32le(s); // discard hres stbi__get32le(s); // discard vres @@ -5313,17 +5488,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) } if (info->bpp == 16 || info->bpp == 32) { if (compress == 0) { - if (info->bpp == 32) { - info->mr = 0xffu << 16; - info->mg = 0xffu << 8; - info->mb = 0xffu << 0; - info->ma = 0xffu << 24; - info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 - } else { - info->mr = 31u << 10; - info->mg = 31u << 5; - info->mb = 31u << 0; - } + stbi__bmp_set_mask_defaults(info, compress); } else if (compress == 3) { info->mr = stbi__get32le(s); info->mg = stbi__get32le(s); @@ -5338,6 +5503,7 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) return stbi__errpuc("bad BMP", "bad BMP"); } } else { + // V4/V5 header int i; if (hsz != 108 && hsz != 124) return stbi__errpuc("bad BMP", "bad BMP"); @@ -5345,6 +5511,8 @@ static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) info->mg = stbi__get32le(s); info->mb = stbi__get32le(s); info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); stbi__get32le(s); // discard color space for (i=0; i < 12; ++i) stbi__get32le(s); // discard color space parameters @@ -5394,9 +5562,22 @@ static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req psize = (info.offset - info.extra_read - info.hsz) >> 2; } if (psize == 0) { - STBI_ASSERT(info.offset == s->callback_already_read + (int) (s->img_buffer - s->img_buffer_original)); - if (info.offset != s->callback_already_read + (s->img_buffer - s->buffer_start)) { - return stbi__errpuc("bad offset", "Corrupt BMP"); + // accept some number of extra bytes after the header, but if the offset points either to before + // the header ends or implies a large amount of extra data, reject the file as malformed + int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); + int header_limit = 1024; // max we actually read is below 256 bytes currently. + int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. + if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { + return stbi__errpuc("bad header", "Corrupt BMP"); + } + // we established that bytes_read_so_far is positive and sensible. + // the first half of this test rejects offsets that are either too small positives, or + // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn + // ensures the number computed in the second half of the test can't overflow. + if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } else { + stbi__skip(s, info.offset - bytes_read_so_far); } } @@ -6342,6 +6523,7 @@ static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_c // intermediate buffer is RGBA result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0); + if (!result) return stbi__errpuc("outofmem", "Out of memory"); memset(result, 0xff, x*y*4); if (!stbi__pic_load_core(s,x,y,comp, result)) { @@ -6457,6 +6639,7 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) { stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); + if (!g) return stbi__err("outofmem", "Out of memory"); if (!stbi__gif_header(s, g, comp, 1)) { STBI_FREE(g); stbi__rewind( s ); @@ -6766,6 +6949,17 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i } } +static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) +{ + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); + + if (out) STBI_FREE(out); + if (delays && *delays) STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); +} + static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp) { if (stbi__gif_test(s)) { @@ -6777,6 +6971,10 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int stride; int out_size = 0; int delays_size = 0; + + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); + memset(&g, 0, sizeof(g)); if (delays) { *delays = 0; @@ -6794,26 +6992,29 @@ static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, if (out) { void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride ); - if (NULL == tmp) { - STBI_FREE(g.out); - STBI_FREE(g.history); - STBI_FREE(g.background); - return stbi__errpuc("outofmem", "Out of memory"); - } + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); else { out = (stbi_uc*) tmp; out_size = layers * stride; } if (delays) { - *delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers ); + int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers ); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; delays_size = layers * sizeof(int); } } else { out = (stbi_uc*)stbi__malloc( layers * stride ); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); out_size = layers * stride; if (delays) { *delays = (int*) stbi__malloc( layers * sizeof(int) ); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); delays_size = layers * sizeof(int); } } @@ -7064,12 +7265,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re // Run value = stbi__get8(s); count -= 128; - if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } + if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } for (z = 0; z < count; ++z) scanline[i++ * 4 + k] = value; } else { // Dump - if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } + if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } for (z = 0; z < count; ++z) scanline[i++ * 4 + k] = stbi__get8(s); } @@ -7138,9 +7339,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) info.all_a = 255; p = stbi__bmp_parse_header(s, &info); - stbi__rewind( s ); - if (p == NULL) + if (p == NULL) { + stbi__rewind( s ); return 0; + } if (x) *x = s->img_x; if (y) *y = s->img_y; if (comp) { @@ -7206,8 +7408,8 @@ static int stbi__psd_is16(stbi__context *s) stbi__rewind( s ); return 0; } - (void) stbi__get32be(s); - (void) stbi__get32be(s); + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); depth = stbi__get16be(s); if (depth != 16) { stbi__rewind( s ); @@ -7286,7 +7488,6 @@ static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) // Known limitations: // Does not support comments in the header section // Does not support ASCII image data (formats P2 and P3) -// Does not support 16-bit-per-channel #ifndef STBI_NO_PNM @@ -7307,7 +7508,8 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req stbi_uc *out; STBI_NOTUSED(ri); - if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n)) + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) return 0; if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); @@ -7317,15 +7519,22 @@ static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req *y = s->img_y; if (comp) *comp = s->img_n; - if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0)) + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) return stbi__errpuc("too large", "PNM too large"); - out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0); + out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); if (!out) return stbi__errpuc("outofmem", "Out of memory"); - stbi__getn(s, out, s->img_n * s->img_x * s->img_y); + if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) { + STBI_FREE(out); + return stbi__errpuc("bad PNM", "PNM file truncated"); + } if (req_comp && req_comp != s->img_n) { - out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + if (ri->bits_per_channel == 16) { + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y); + } else { + out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + } if (out == NULL) return out; // stbi__convert_format frees input on failure } return out; @@ -7362,6 +7571,8 @@ static int stbi__pnm_getinteger(stbi__context *s, char *c) while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { value = value*10 + (*c - '0'); *c = (char) stbi__get8(s); + if((value > 214748364) || (value == 214748364 && *c > '7')) + return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int"); } return value; @@ -7392,17 +7603,29 @@ static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) stbi__pnm_skip_whitespace(s, &c); *x = stbi__pnm_getinteger(s, &c); // read width + if(*x == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); stbi__pnm_skip_whitespace(s, &c); *y = stbi__pnm_getinteger(s, &c); // read height + if (*y == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); stbi__pnm_skip_whitespace(s, &c); maxv = stbi__pnm_getinteger(s, &c); // read max value - - if (maxv > 255) - return stbi__err("max value > 255", "PPM image not 8-bit"); + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; else - return 1; + return 8; +} + +static int stbi__pnm_is16(stbi__context *s) +{ + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; } #endif @@ -7458,6 +7681,9 @@ static int stbi__is_16_main(stbi__context *s) if (stbi__psd_is16(s)) return 1; #endif + #ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) return 1; + #endif return 0; } diff --git a/third_party/stb/stb_image_resize2.h b/third_party/stb/stb_image_resize2.h new file mode 100644 index 0000000..6146ab7 --- /dev/null +++ b/third_party/stb/stb_image_resize2.h @@ -0,0 +1,10651 @@ +/* stb_image_resize2 - v2.17 - public domain image resizing + + by Jeff Roberts (v2) and Jorge L Rodriguez + http://github.com/nothings/stb + + Can be threaded with the extended API. SSE2, AVX, Neon and WASM SIMD support. Only + scaling and translation is supported, no rotations or shears. + + COMPILING & LINKING + In one C/C++ file that #includes this file, do this: + #define STB_IMAGE_RESIZE_IMPLEMENTATION + before the #include. That will create the implementation in that file. + + EASY API CALLS: + Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation, clamps to edge. + + stbir_resize_uint8_srgb( input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout_enum ) + + stbir_resize_uint8_linear( input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout_enum ) + + stbir_resize_float_linear( input_pixels, input_w, input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout_enum ) + + If you pass NULL or zero for the output_pixels, we will allocate the output buffer + for you and return it from the function (free with free() or STBIR_FREE). + As a special case, XX_stride_in_bytes of 0 means packed continuously in memory. + + API LEVELS + There are three levels of API - easy-to-use, medium-complexity and extended-complexity. + + See the "header file" section of the source for API documentation. + + ADDITIONAL DOCUMENTATION + + MEMORY ALLOCATION + By default, we use malloc and free for memory allocation. To override the + memory allocation, before the implementation #include, add a: + + #define STBIR_MALLOC(size,user_data) ... + #define STBIR_FREE(ptr,user_data) ... + + Each resize makes exactly one call to malloc/free (unless you use the + extended API where you can do one allocation for many resizes). Under + address sanitizer, we do separate allocations to find overread/writes. + + PERFORMANCE + This library was written with an emphasis on performance. When testing + stb_image_resize with RGBA, the fastest mode is STBIR_4CHANNEL with + STBIR_TYPE_UINT8 pixels and CLAMPed edges (which is what many other resize + libs do by default). Also, make sure SIMD is turned on of course (default + for 64-bit targets). Avoid WRAP edge mode if you want the fastest speed. + + This library also comes with profiling built-in. If you define STBIR_PROFILE, + you can use the advanced API and get low-level profiling information by + calling stbir_resize_extended_profile_info() or stbir_resize_split_profile_info() + after a resize. + + SIMD + Most of the routines have optimized SSE2, AVX, NEON and WASM versions. + + On Microsoft compilers, we automatically turn on SIMD for 64-bit x64 and + ARM; for 32-bit x86 and ARM, you select SIMD mode by defining STBIR_SSE2 or + STBIR_NEON. For AVX and AVX2, we auto-select it by detecting the /arch:AVX + or /arch:AVX2 switches. You can also always manually turn SSE2, AVX or AVX2 + support on by defining STBIR_SSE2, STBIR_AVX or STBIR_AVX2. + + On Linux, SSE2 and Neon is on by default for 64-bit x64 or ARM64. For 32-bit, + we select x86 SIMD mode by whether you have -msse2, -mavx or -mavx2 enabled + on the command line. For 32-bit ARM, you must pass -mfpu=neon-vfpv4 for both + clang and GCC, but GCC also requires an additional -mfp16-format=ieee to + automatically enable NEON. + + On x86 platforms, you can also define STBIR_FP16C to turn on FP16C instructions + for converting back and forth to half-floats. This is autoselected when we + are using AVX2. Clang and GCC also require the -mf16c switch. ARM always uses + the built-in half float hardware NEON instructions. + + You can also tell us to use multiply-add instructions with STBIR_USE_FMA. + Because x86 doesn't always have fma, we turn it off by default to maintain + determinism across all platforms. If you don't care about non-FMA determinism + and are willing to restrict yourself to more recent x86 CPUs (around the AVX + timeframe), then fma will give you around a 15% speedup. + + You can force off SIMD in all cases by defining STBIR_NO_SIMD. You can turn + off AVX or AVX2 specifically with STBIR_NO_AVX or STBIR_NO_AVX2. AVX is 10% + to 40% faster, and AVX2 is generally another 12%. + + ALPHA CHANNEL + Most of the resizing functions provide the ability to control how the alpha + channel of an image is processed. + + When alpha represents transparency, it is important that when combining + colors with filtering, the pixels should not be treated equally; they + should use a weighted average based on their alpha values. For example, + if a pixel is 1% opaque bright green and another pixel is 99% opaque + black and you average them, the average will be 50% opaque, but the + unweighted average and will be a middling green color, while the weighted + average will be nearly black. This means the unweighted version introduced + green energy that didn't exist in the source image. + + (If you want to know why this makes sense, you can work out the math for + the following: consider what happens if you alpha composite a source image + over a fixed color and then average the output, vs. if you average the + source image pixels and then composite that over the same fixed color. + Only the weighted average produces the same result as the ground truth + composite-then-average result.) + + Therefore, it is in general best to "alpha weight" the pixels when applying + filters to them. This essentially means multiplying the colors by the alpha + values before combining them, and then dividing by the alpha value at the + end. + + The computer graphics industry introduced a technique called "premultiplied + alpha" or "associated alpha" in which image colors are stored in image files + already multiplied by their alpha. This saves some math when compositing, + and also avoids the need to divide by the alpha at the end (which is quite + inefficient). However, while premultiplied alpha is common in the movie CGI + industry, it is not commonplace in other industries like videogames, and most + consumer file formats are generally expected to contain not-premultiplied + colors. For example, Photoshop saves PNG files "unpremultiplied", and web + browsers like Chrome and Firefox expect PNG images to be unpremultiplied. + + Note that there are three possibilities that might describe your image + and resize expectation: + + 1. images are not premultiplied, alpha weighting is desired + 2. images are not premultiplied, alpha weighting is not desired + 3. images are premultiplied + + Both case #2 and case #3 require the exact same math: no alpha weighting + should be applied or removed. Only case 1 requires extra math operations; + the other two cases can be handled identically. + + stb_image_resize expects case #1 by default, applying alpha weighting to + images, expecting the input images to be unpremultiplied. This is what the + COLOR+ALPHA buffer types tell the resizer to do. + + When you use the pixel layouts STBIR_RGBA, STBIR_BGRA, STBIR_ARGB, + STBIR_ABGR, STBIR_RX, or STBIR_XR you are telling us that the pixels are + non-premultiplied. In these cases, the resizer will alpha weight the colors + (effectively creating the premultiplied image), do the filtering, and then + convert back to non-premult on exit. + + When you use the pixel layouts STBIR_RGBA_PM, STBIR_RGBA_PM, STBIR_RGBA_PM, + STBIR_RGBA_PM, STBIR_RX_PM or STBIR_XR_PM, you are telling that the pixels + ARE premultiplied. In this case, the resizer doesn't have to do the + premultipling - it can filter directly on the input. This about twice as + fast as the non-premultiplied case, so it's the right option if your data is + already setup correctly. + + When you use the pixel layout STBIR_4CHANNEL or STBIR_2CHANNEL, you are + telling us that there is no channel that represents transparency; it may be + RGB and some unrelated fourth channel that has been stored in the alpha + channel, but it is actually not alpha. No special processing will be + performed. + + The difference between the generic 4 or 2 channel layouts, and the + specialized _PM versions is with the _PM versions you are telling us that + the data *is* alpha, just don't premultiply it. That's important when + using SRGB pixel formats, we need to know where the alpha is, because + it is converted linearly (rather than with the SRGB converters). + + Because alpha weighting produces the same effect as premultiplying, you + even have the option with non-premultiplied inputs to let the resizer + produce a premultiplied output. Because the intially computed alpha-weighted + output image is effectively premultiplied, this is actually more performant + than the normal path which un-premultiplies the output image as a final step. + + Finally, when converting both in and out of non-premulitplied space (for + example, when using STBIR_RGBA), we go to somewhat heroic measures to + ensure that areas with zero alpha value pixels get something reasonable + in the RGB values. If you don't care about the RGB values of zero alpha + pixels, you can call the stbir_set_non_pm_alpha_speed_over_quality() + function - this runs a premultiplied resize about 25% faster. That said, + when you really care about speed, using premultiplied pixels for both in + and out (STBIR_RGBA_PM, etc) much faster than both of these premultiplied + options. + + PIXEL LAYOUT CONVERSION + The resizer can convert from some pixel layouts to others. When using the + stbir_set_pixel_layouts(), you can, for example, specify STBIR_RGBA + on input, and STBIR_ARGB on output, and it will re-organize the channels + during the resize. Currently, you can only convert between two pixel + layouts with the same number of channels. + + DETERMINISM + We commit to being deterministic (from x64 to ARM to scalar to SIMD, etc). + This requires compiling with fast-math off (using at least /fp:precise). + Also, you must turn off fp-contracting (which turns mult+adds into fmas)! + We attempt to do this with pragmas, but with Clang, you usually want to add + -ffp-contract=off to the command line as well. + + For 32-bit x86, you must use SSE and SSE2 codegen for determinism. That is, + if the scalar x87 unit gets used at all, we immediately lose determinism. + On Microsoft Visual Studio 2008 and earlier, from what we can tell there is + no way to be deterministic in 32-bit x86 (some x87 always leaks in, even + with fp:strict). On 32-bit x86 GCC, determinism requires both -msse2 and + -fpmath=sse. + + Note that we will not be deterministic with float data containing NaNs - + the NaNs will propagate differently on different SIMD and platforms. + + If you turn on STBIR_USE_FMA, then we will be deterministic with other + fma targets, but we will differ from non-fma targets (this is unavoidable, + because a fma isn't simply an add with a mult - it also introduces a + rounding difference compared to non-fma instruction sequences. + + FLOAT PIXEL FORMAT RANGE + Any range of values can be used for the non-alpha float data that you pass + in (0 to 1, -1 to 1, whatever). However, if you are inputting float values + but *outputting* bytes or shorts, you must use a range of 0 to 1 so that we + scale back properly. The alpha channel must also be 0 to 1 for any format + that does premultiplication prior to resizing. + + Note also that with float output, using filters with negative lobes, the + output filtered values might go slightly out of range. You can define + STBIR_FLOAT_LOW_CLAMP and/or STBIR_FLOAT_HIGH_CLAMP to specify the range + to clamp to on output, if that's important. + + MAX/MIN SCALE FACTORS + The input pixel resolutions are in integers, and we do the internal pointer + resolution in size_t sized integers. However, the scale ratio from input + resolution to output resolution is calculated in float form. This means + the effective possible scale ratio is limited to 24 bits (or 16 million + to 1). As you get close to the size of the float resolution (again, 16 + million pixels wide or high), you might start seeing float inaccuracy + issues in general in the pipeline. If you have to do extreme resizes, + you can usually do this is multiple stages (using float intermediate + buffers). + + FLIPPED IMAGES + Stride is just the delta from one scanline to the next. This means you can + use a negative stride to handle inverted images (point to the final + scanline and use a negative stride). You can invert the input or output, + using negative strides. + + DEFAULT FILTERS + For functions which don't provide explicit control over what filters to + use, you can change the compile-time defaults with: + + #define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_something + #define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_something + + See stbir_filter in the header-file section for the list of filters. + + NEW FILTERS + A number of 1D filter kernels are supplied. For a list of supported + filters, see the stbir_filter enum. You can install your own filters by + using the stbir_set_filter_callbacks function. + + PROGRESS + For interactive use with slow resize operations, you can use the + scanline callbacks in the extended API. It would have to be a *very* large + image resample to need progress though - we're very fast. + + CEIL and FLOOR + In scalar mode, the only functions we use from math.h are ceilf and floorf, + but if you have your own versions, you can define the STBIR_CEILF(v) and + STBIR_FLOORF(v) macros and we'll use them instead. In SIMD, we just use + our own versions. + + ASSERT + Define STBIR_ASSERT(boolval) to override assert() and not use assert.h + + PORTING FROM VERSION 1 + The API has changed. You can continue to use the old version of stb_image_resize.h, + which is available in the "deprecated/" directory. + + If you're using the old simple-to-use API, porting is straightforward. + (For more advanced APIs, read the documentation.) + + stbir_resize_uint8(): + - call `stbir_resize_uint8_linear`, cast channel count to `stbir_pixel_layout` + + stbir_resize_float(): + - call `stbir_resize_float_linear`, cast channel count to `stbir_pixel_layout` + + stbir_resize_uint8_srgb(): + - function name is unchanged + - cast channel count to `stbir_pixel_layout` + - above is sufficient unless your image has alpha and it's not RGBA/BGRA + - in that case, follow the below instructions for stbir_resize_uint8_srgb_edgemode + + stbir_resize_uint8_srgb_edgemode() + - switch to the "medium complexity" API + - stbir_resize(), very similar API but a few more parameters: + - pixel_layout: cast channel count to `stbir_pixel_layout` + - data_type: STBIR_TYPE_UINT8_SRGB + - edge: unchanged (STBIR_EDGE_WRAP, etc.) + - filter: STBIR_FILTER_DEFAULT + - which channel is alpha is specified in stbir_pixel_layout, see enum for details + + FUTURE TODOS + * For polyphase integral filters, we just memcpy the coeffs to dupe + them, but we should indirect and use the same coeff memory. + * Add pixel layout conversions for sensible different channel counts + (maybe, 1->3/4, 3->4, 4->1, 3->1). + * For SIMD encode and decode scanline routines, do any pre-aligning + for bad input/output buffer alignments and pitch? + * For very wide scanlines, we should we do vertical strips to stay within + L2 cache. Maybe do chunks of 1K pixels at a time. There would be + some pixel reconversion, but probably dwarfed by things falling out + of cache. Probably also something possible with alternating between + scattering and gathering at high resize scales? + * Should we have a multiple MIPs at the same time function (could keep + more memory in cache during multiple resizes)? + * Rewrite the coefficient generator to do many at once. + * AVX-512 vertical kernels - worried about downclocking here. + * Convert the reincludes to macros when we know they aren't changing. + * Experiment with pivoting the horizontal and always using the + vertical filters (which are faster, but perhaps not enough to overcome + the pivot cost and the extra memory touches). Need to buffer the whole + image so have to balance memory use. + * Most of our code is internally function pointers, should we compile + all the SIMD stuff always and dynamically dispatch? + + CONTRIBUTORS + Jeff Roberts: 2.0 implementation, optimizations, SIMD + Martins Mozeiko: NEON simd, WASM simd, clang and GCC whisperer + Fabian Giesen: half float and srgb converters + Sean Barrett: API design, optimizations + Jorge L Rodriguez: Original 1.0 implementation + Aras Pranckevicius: bugfixes + Nathan Reed: warning fixes for 1.0 + + REVISIONS + 2.17 (2025-10-25) silly format bug in easy-to-use APIs. + 2.16 (2025-10-21) fixed the easy-to-use APIs to allow inverted bitmaps (negative + strides), fix vertical filter kernel callback, fix threaded + gather buffer priming (and assert). + (thanks adipose, TainZerL, and Harrison Green) + 2.15 (2025-07-17) fixed an assert in debug mode when using floats with input + callbacks, work around GCC warning when adding to null ptr + (thanks Johannes Spohr and Pyry Kovanen). + 2.14 (2025-05-09) fixed a bug using downsampling gather horizontal first, and + scatter with vertical first. + 2.13 (2025-02-27) fixed a bug when using input callbacks, turned off simd for + tiny-c, fixed some variables that should have been static, + fixes a bug when calculating temp memory with resizes that + exceed 2GB of temp memory (very large resizes). + 2.12 (2024-10-18) fix incorrect use of user_data with STBIR_FREE + 2.11 (2024-09-08) fix harmless asan warnings in 2-channel and 3-channel mode + with AVX-2, fix some weird scaling edge conditions with + point sample mode. + 2.10 (2024-07-27) fix the defines GCC and mingw for loop unroll control, + fix MSVC 32-bit arm half float routines. + 2.09 (2024-06-19) fix the defines for 32-bit ARM GCC builds (was selecting + hardware half floats). + 2.08 (2024-06-10) fix for RGB->BGR three channel flips and add SIMD (thanks + to Ryan Salsbury), fix for sub-rect resizes, use the + pragmas to control unrolling when they are available. + 2.07 (2024-05-24) fix for slow final split during threaded conversions of very + wide scanlines when downsampling (caused by extra input + converting), fix for wide scanline resamples with many + splits (int overflow), fix GCC warning. + 2.06 (2024-02-10) fix for identical width/height 3x or more down-scaling + undersampling a single row on rare resize ratios (about 1%). + 2.05 (2024-02-07) fix for 2 pixel to 1 pixel resizes with wrap (thanks Aras), + fix for output callback (thanks Julien Koenen). + 2.04 (2023-11-17) fix for rare AVX bug, shadowed symbol (thanks Nikola Smiljanic). + 2.03 (2023-11-01) ASAN and TSAN warnings fixed, minor tweaks. + 2.00 (2023-10-10) mostly new source: new api, optimizations, simd, vertical-first, etc + 2x-5x faster without simd, 4x-12x faster with simd, + in some cases, 20x to 40x faster esp resizing large to very small. + 0.96 (2019-03-04) fixed warnings + 0.95 (2017-07-23) fixed warnings + 0.94 (2017-03-18) fixed warnings + 0.93 (2017-03-03) fixed bug with certain combinations of heights + 0.92 (2017-01-02) fix integer overflow on large (>2GB) images + 0.91 (2016-04-02) fix warnings; fix handling of subpixel regions + 0.90 (2014-09-17) first released version + + LICENSE + See end of file for license information. +*/ + +#if !defined(STB_IMAGE_RESIZE_DO_HORIZONTALS) && !defined(STB_IMAGE_RESIZE_DO_VERTICALS) && !defined(STB_IMAGE_RESIZE_DO_CODERS) // for internal re-includes + +#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE2_H +#define STBIR_INCLUDE_STB_IMAGE_RESIZE2_H + +#include +#ifdef _MSC_VER +typedef unsigned char stbir_uint8; +typedef unsigned short stbir_uint16; +typedef unsigned int stbir_uint32; +typedef unsigned __int64 stbir_uint64; +#else +#include +typedef uint8_t stbir_uint8; +typedef uint16_t stbir_uint16; +typedef uint32_t stbir_uint32; +typedef uint64_t stbir_uint64; +#endif + +#ifndef STBIRDEF +#ifdef STB_IMAGE_RESIZE_STATIC +#define STBIRDEF static +#else +#ifdef __cplusplus +#define STBIRDEF extern "C" +#else +#define STBIRDEF extern +#endif +#endif +#endif + +////////////////////////////////////////////////////////////////////////////// +//// start "header file" /////////////////////////////////////////////////// +// +// Easy-to-use API: +// +// * stride is the offset between successive rows of image data +// in memory, in bytes. specify 0 for packed continuously in memory +// * colorspace is linear or sRGB as specified by function name +// * Uses the default filters +// * Uses edge mode clamped +// * returned result is 1 for success or 0 in case of an error. + + +// stbir_pixel_layout specifies: +// number of channels +// order of channels +// whether color is premultiplied by alpha +// for back compatibility, you can cast the old channel count to an stbir_pixel_layout +typedef enum +{ + STBIR_1CHANNEL = 1, + STBIR_2CHANNEL = 2, + STBIR_RGB = 3, // 3-chan, with order specified (for channel flipping) + STBIR_BGR = 0, // 3-chan, with order specified (for channel flipping) + STBIR_4CHANNEL = 5, + + STBIR_RGBA = 4, // alpha formats, where alpha is NOT premultiplied into color channels + STBIR_BGRA = 6, + STBIR_ARGB = 7, + STBIR_ABGR = 8, + STBIR_RA = 9, + STBIR_AR = 10, + + STBIR_RGBA_PM = 11, // alpha formats, where alpha is premultiplied into color channels + STBIR_BGRA_PM = 12, + STBIR_ARGB_PM = 13, + STBIR_ABGR_PM = 14, + STBIR_RA_PM = 15, + STBIR_AR_PM = 16, + + STBIR_RGBA_NO_AW = 11, // alpha formats, where NO alpha weighting is applied at all! + STBIR_BGRA_NO_AW = 12, // these are just synonyms for the _PM flags (which also do + STBIR_ARGB_NO_AW = 13, // no alpha weighting). These names just make it more clear + STBIR_ABGR_NO_AW = 14, // for some folks). + STBIR_RA_NO_AW = 15, + STBIR_AR_NO_AW = 16, + +} stbir_pixel_layout; + +//=============================================================== +// Simple-complexity API +// +// If output_pixels is NULL (0), then we will allocate the buffer and return it to you. +//-------------------------------- + +STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_type ); + +STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_type ); + +STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + float *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_type ); +//=============================================================== + +//=============================================================== +// Medium-complexity API +// +// This extends the easy-to-use API as follows: +// +// * Can specify the datatype - U8, U8_SRGB, U16, FLOAT, HALF_FLOAT +// * Edge wrap can selected explicitly +// * Filter can be selected explicitly +//-------------------------------- + +typedef enum +{ + STBIR_EDGE_CLAMP = 0, + STBIR_EDGE_REFLECT = 1, + STBIR_EDGE_WRAP = 2, // this edge mode is slower and uses more memory + STBIR_EDGE_ZERO = 3, +} stbir_edge; + +typedef enum +{ + STBIR_FILTER_DEFAULT = 0, // use same filter type that easy-to-use API chooses + STBIR_FILTER_BOX = 1, // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios + STBIR_FILTER_TRIANGLE = 2, // On upsampling, produces same results as bilinear texture filtering + STBIR_FILTER_CUBICBSPLINE = 3, // The cubic b-spline (aka Mitchell-Netrevalli with B=1,C=0), gaussian-esque + STBIR_FILTER_CATMULLROM = 4, // An interpolating cubic spline + STBIR_FILTER_MITCHELL = 5, // Mitchell-Netrevalli filter with B=1/3, C=1/3 + STBIR_FILTER_POINT_SAMPLE = 6, // Simple point sampling + STBIR_FILTER_OTHER = 7, // User callback specified +} stbir_filter; + +typedef enum +{ + STBIR_TYPE_UINT8 = 0, + STBIR_TYPE_UINT8_SRGB = 1, + STBIR_TYPE_UINT8_SRGB_ALPHA = 2, // alpha channel, when present, should also be SRGB (this is very unusual) + STBIR_TYPE_UINT16 = 3, + STBIR_TYPE_FLOAT = 4, + STBIR_TYPE_HALF_FLOAT = 5 +} stbir_datatype; + +// medium api +STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout, stbir_datatype data_type, + stbir_edge edge, stbir_filter filter ); +//=============================================================== + + + +//=============================================================== +// Extended-complexity API +// +// This API exposes all resize functionality. +// +// * Separate filter types for each axis +// * Separate edge modes for each axis +// * Separate input and output data types +// * Can specify regions with subpixel correctness +// * Can specify alpha flags +// * Can specify a memory callback +// * Can specify a callback data type for pixel input and output +// * Can be threaded for a single resize +// * Can be used to resize many frames without recalculating the sampler info +// +// Use this API as follows: +// 1) Call the stbir_resize_init function on a local STBIR_RESIZE structure +// 2) Call any of the stbir_set functions +// 3) Optionally call stbir_build_samplers() if you are going to resample multiple times +// with the same input and output dimensions (like resizing video frames) +// 4) Resample by calling stbir_resize_extended(). +// 5) Call stbir_free_samplers() if you called stbir_build_samplers() +//-------------------------------- + + +// Types: + +// INPUT CALLBACK: this callback is used for input scanlines +typedef void const * stbir_input_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context ); + +// OUTPUT CALLBACK: this callback is used for output scanlines +typedef void stbir_output_callback( void const * output_ptr, int num_pixels, int y, void * context ); + +// callbacks for user installed filters +typedef float stbir__kernel_callback( float x, float scale, void * user_data ); // centered at zero +typedef float stbir__support_callback( float scale, void * user_data ); + +// internal structure with precomputed scaling +typedef struct stbir__info stbir__info; + +typedef struct STBIR_RESIZE // use the stbir_resize_init and stbir_override functions to set these values for future compatibility +{ + void * user_data; + void const * input_pixels; + int input_w, input_h; + double input_s0, input_t0, input_s1, input_t1; + stbir_input_callback * input_cb; + void * output_pixels; + int output_w, output_h; + int output_subx, output_suby, output_subw, output_subh; + stbir_output_callback * output_cb; + int input_stride_in_bytes; + int output_stride_in_bytes; + int splits; + int fast_alpha; + int needs_rebuild; + int called_alloc; + stbir_pixel_layout input_pixel_layout_public; + stbir_pixel_layout output_pixel_layout_public; + stbir_datatype input_data_type; + stbir_datatype output_data_type; + stbir_filter horizontal_filter, vertical_filter; + stbir_edge horizontal_edge, vertical_edge; + stbir__kernel_callback * horizontal_filter_kernel; stbir__support_callback * horizontal_filter_support; + stbir__kernel_callback * vertical_filter_kernel; stbir__support_callback * vertical_filter_support; + stbir__info * samplers; +} STBIR_RESIZE; + +// extended complexity api + + +// First off, you must ALWAYS call stbir_resize_init on your resize structure before any of the other calls! +STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, + const void *input_pixels, int input_w, int input_h, int input_stride_in_bytes, // stride can be zero + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero + stbir_pixel_layout pixel_layout, stbir_datatype data_type ); + +//=============================================================== +// You can update these parameters any time after resize_init and there is no cost +//-------------------------------- + +STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type ); +STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb ); // no callbacks by default +STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data ); // pass back STBIR_RESIZE* by default +STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes ); + +//=============================================================== + + +//=============================================================== +// If you call any of these functions, you will trigger a sampler rebuild! +//-------------------------------- + +STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout ); // sets new buffer layouts +STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge ); // CLAMP by default + +STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ); // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default +STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support ); + +STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets both sub-regions (full regions by default) +STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 ); // sets input sub-region (full region by default) +STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ); // sets output sub-region (full region by default) + +// when inputting AND outputting non-premultiplied alpha pixels, we use a slower but higher quality technique +// that fills the zero alpha pixel's RGB values with something plausible. If you don't care about areas of +// zero alpha, you can call this function to get about a 25% speed improvement for STBIR_RGBA to STBIR_RGBA +// types of resizes. +STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality ); +//=============================================================== + + +//=============================================================== +// You can call build_samplers to prebuild all the internal data we need to resample. +// Then, if you call resize_extended many times with the same resize, you only pay the +// cost once. +// If you do call build_samplers, you MUST call free_samplers eventually. +//-------------------------------- + +// This builds the samplers and does one allocation +STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ); + +// You MUST call this, if you call stbir_build_samplers or stbir_build_samplers_with_splits +STBIRDEF void stbir_free_samplers( STBIR_RESIZE * resize ); +//=============================================================== + + +// And this is the main function to perform the resize synchronously on one thread. +STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize ); + + +//=============================================================== +// Use these functions for multithreading. +// 1) You call stbir_build_samplers_with_splits first on the main thread +// 2) Then stbir_resize_with_split on each thread +// 3) stbir_free_samplers when done on the main thread +//-------------------------------- + +// This will build samplers for threading. +// You can pass in the number of threads you'd like to use (try_splits). +// It returns the number of splits (threads) that you can call it with. +/// It might be less if the image resize can't be split up that many ways. + +STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int try_splits ); + +// This function does a split of the resizing (you call this fuction for each +// split, on multiple threads). A split is a piece of the output resize pixel space. + +// Note that you MUST call stbir_build_samplers_with_splits before stbir_resize_extended_split! + +// Usually, you will always call stbir_resize_split with split_start as the thread_index +// and "1" for the split_count. +// But, if you have a weird situation where you MIGHT want 8 threads, but sometimes +// only 4 threads, you can use 0,2,4,6 for the split_start's and use "2" for the +// split_count each time to turn in into a 4 thread resize. (This is unusual). + +STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count ); +//=============================================================== + + +//=============================================================== +// Pixel Callbacks info: +//-------------------------------- + +// The input callback is super flexible - it calls you with the input address +// (based on the stride and base pointer), it gives you an optional_output +// pointer that you can fill, or you can just return your own pointer into +// your own data. +// +// You can also do conversion from non-supported data types if necessary - in +// this case, you ignore the input_ptr and just use the x and y parameters to +// calculate your own input_ptr based on the size of each non-supported pixel. +// (Something like the third example below.) +// +// You can also install just an input or just an output callback by setting the +// callback that you don't want to zero. +// +// First example, progress: (getting a callback that you can monitor the progress): +// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context ) +// { +// percentage_done = y / input_height; +// return input_ptr; // use buffer from call +// } +// +// Next example, copying: (copy from some other buffer or stream): +// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context ) +// { +// CopyOrStreamData( optional_output, other_data_src, num_pixels * pixel_width_in_bytes ); +// return optional_output; // return the optional buffer that we filled +// } +// +// Third example, input another buffer without copying: (zero-copy from other buffer): +// void const * my_callback( void * optional_output, void const * input_ptr, int num_pixels, int x, int y, void * context ) +// { +// void * pixels = ( (char*) other_image_base ) + ( y * other_image_stride ) + ( x * other_pixel_width_in_bytes ); +// return pixels; // return pointer to your data without copying +// } +// +// +// The output callback is considerably simpler - it just calls you so that you can dump +// out each scanline. You could even directly copy out to disk if you have a simple format +// like TGA or BMP. You can also convert to other output types here if you want. +// +// Simple example: +// void const * my_output( void * output_ptr, int num_pixels, int y, void * context ) +// { +// percentage_done = y / output_height; +// fwrite( output_ptr, pixel_width_in_bytes, num_pixels, output_file ); +// } +//=============================================================== + + + + +//=============================================================== +// optional built-in profiling API +//-------------------------------- + +#ifdef STBIR_PROFILE + +typedef struct STBIR_PROFILE_INFO +{ + stbir_uint64 total_clocks; + + // how many clocks spent (of total_clocks) in the various resize routines, along with a string description + // there are "resize_count" number of zones + stbir_uint64 clocks[ 8 ]; + char const ** descriptions; + + // count of clocks and descriptions + stbir_uint32 count; +} STBIR_PROFILE_INFO; + +// use after calling stbir_resize_extended (or stbir_build_samplers or stbir_build_samplers_with_splits) +STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize ); + +// use after calling stbir_resize_extended +STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize ); + +// use after calling stbir_resize_extended_split +STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * out_info, STBIR_RESIZE const * resize, int split_start, int split_num ); + +//=============================================================== + +#endif + + +//// end header file ///////////////////////////////////////////////////// +#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE2_H + +#if defined(STB_IMAGE_RESIZE_IMPLEMENTATION) || defined(STB_IMAGE_RESIZE2_IMPLEMENTATION) + +#ifndef STBIR_ASSERT +#include +#define STBIR_ASSERT(x) assert(x) +#endif + +#ifndef STBIR_MALLOC +#include +#define STBIR_MALLOC(size,user_data) ((void)(user_data), malloc(size)) +#define STBIR_FREE(ptr,user_data) ((void)(user_data), free(ptr)) +// (we used the comma operator to evaluate user_data, to avoid "unused parameter" warnings) +#endif + +#ifdef _MSC_VER + +#define stbir__inline __forceinline + +#else + +#define stbir__inline __inline__ + +// Clang address sanitizer +#if defined(__has_feature) + #if __has_feature(address_sanitizer) || __has_feature(memory_sanitizer) + #ifndef STBIR__SEPARATE_ALLOCATIONS + #define STBIR__SEPARATE_ALLOCATIONS + #endif + #endif +#endif + +#endif + +// GCC and MSVC +#if defined(__SANITIZE_ADDRESS__) + #ifndef STBIR__SEPARATE_ALLOCATIONS + #define STBIR__SEPARATE_ALLOCATIONS + #endif +#endif + +// Always turn off automatic FMA use - use STBIR_USE_FMA if you want. +// Otherwise, this is a determinism disaster. +#ifndef STBIR_DONT_CHANGE_FP_CONTRACT // override in case you don't want this behavior +#if defined(_MSC_VER) && !defined(__clang__) +#if _MSC_VER > 1200 +#pragma fp_contract(off) +#endif +#elif defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize("fp-contract=off") +#else +#pragma STDC FP_CONTRACT OFF +#endif +#endif + +#ifdef _MSC_VER +#define STBIR__UNUSED(v) (void)(v) +#else +#define STBIR__UNUSED(v) (void)sizeof(v) +#endif + +#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0])) + + +#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE +#define STBIR_DEFAULT_FILTER_UPSAMPLE STBIR_FILTER_CATMULLROM +#endif + +#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE +#define STBIR_DEFAULT_FILTER_DOWNSAMPLE STBIR_FILTER_MITCHELL +#endif + + +#ifndef STBIR__HEADER_FILENAME +#define STBIR__HEADER_FILENAME "stb_image_resize2.h" +#endif + +// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types +// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible +typedef enum +{ + STBIRI_1CHANNEL = 0, + STBIRI_2CHANNEL = 1, + STBIRI_RGB = 2, + STBIRI_BGR = 3, + STBIRI_4CHANNEL = 4, + + STBIRI_RGBA = 5, + STBIRI_BGRA = 6, + STBIRI_ARGB = 7, + STBIRI_ABGR = 8, + STBIRI_RA = 9, + STBIRI_AR = 10, + + STBIRI_RGBA_PM = 11, + STBIRI_BGRA_PM = 12, + STBIRI_ARGB_PM = 13, + STBIRI_ABGR_PM = 14, + STBIRI_RA_PM = 15, + STBIRI_AR_PM = 16, +} stbir_internal_pixel_layout; + +// define the public pixel layouts to not compile inside the implementation (to avoid accidental use) +#define STBIR_BGR bad_dont_use_in_implementation +#define STBIR_1CHANNEL STBIR_BGR +#define STBIR_2CHANNEL STBIR_BGR +#define STBIR_RGB STBIR_BGR +#define STBIR_RGBA STBIR_BGR +#define STBIR_4CHANNEL STBIR_BGR +#define STBIR_BGRA STBIR_BGR +#define STBIR_ARGB STBIR_BGR +#define STBIR_ABGR STBIR_BGR +#define STBIR_RA STBIR_BGR +#define STBIR_AR STBIR_BGR +#define STBIR_RGBA_PM STBIR_BGR +#define STBIR_BGRA_PM STBIR_BGR +#define STBIR_ARGB_PM STBIR_BGR +#define STBIR_ABGR_PM STBIR_BGR +#define STBIR_RA_PM STBIR_BGR +#define STBIR_AR_PM STBIR_BGR + +// must match stbir_datatype +static unsigned char stbir__type_size[] = { + 1,1,1,2,4,2 // STBIR_TYPE_UINT8,STBIR_TYPE_UINT8_SRGB,STBIR_TYPE_UINT8_SRGB_ALPHA,STBIR_TYPE_UINT16,STBIR_TYPE_FLOAT,STBIR_TYPE_HALF_FLOAT +}; + +// When gathering, the contributors are which source pixels contribute. +// When scattering, the contributors are which destination pixels are contributed to. +typedef struct +{ + int n0; // First contributing pixel + int n1; // Last contributing pixel +} stbir__contributors; + +typedef struct +{ + int lowest; // First sample index for whole filter + int highest; // Last sample index for whole filter + int widest; // widest single set of samples for an output +} stbir__filter_extent_info; + +typedef struct +{ + int n0; // First pixel of decode buffer to write to + int n1; // Last pixel of decode that will be written to + int pixel_offset_for_input; // Pixel offset into input_scanline +} stbir__span; + +typedef struct stbir__scale_info +{ + int input_full_size; + int output_sub_size; + float scale; + float inv_scale; + float pixel_shift; // starting shift in output pixel space (in pixels) + int scale_is_rational; + stbir_uint32 scale_numerator, scale_denominator; +} stbir__scale_info; + +typedef struct +{ + stbir__contributors * contributors; + float* coefficients; + stbir__contributors * gather_prescatter_contributors; + float * gather_prescatter_coefficients; + stbir__scale_info scale_info; + float support; + stbir_filter filter_enum; + stbir__kernel_callback * filter_kernel; + stbir__support_callback * filter_support; + stbir_edge edge; + int coefficient_width; + int filter_pixel_width; + int filter_pixel_margin; + int num_contributors; + int contributors_size; + int coefficients_size; + stbir__filter_extent_info extent_info; + int is_gather; // 0 = scatter, 1 = gather with scale >= 1, 2 = gather with scale < 1 + int gather_prescatter_num_contributors; + int gather_prescatter_coefficient_width; + int gather_prescatter_contributors_size; + int gather_prescatter_coefficients_size; +} stbir__sampler; + +typedef struct +{ + stbir__contributors conservative; + int edge_sizes[2]; // this can be less than filter_pixel_margin, if the filter and scaling falls off + stbir__span spans[2]; // can be two spans, if doing input subrect with clamp mode WRAP +} stbir__extents; + +typedef struct +{ +#ifdef STBIR_PROFILE + union + { + struct { stbir_uint64 total, looping, vertical, horizontal, decode, encode, alpha, unalpha; } named; + stbir_uint64 array[8]; + } profile; + stbir_uint64 * current_zone_excluded_ptr; +#endif + float* decode_buffer; + + int ring_buffer_first_scanline; + int ring_buffer_last_scanline; + int ring_buffer_begin_index; // first_scanline is at this index in the ring buffer + int start_output_y, end_output_y; + int start_input_y, end_input_y; // used in scatter only + + #ifdef STBIR__SEPARATE_ALLOCATIONS + float** ring_buffers; // one pointer for each ring buffer + #else + float* ring_buffer; // one big buffer that we index into + #endif + + float* vertical_buffer; + + char no_cache_straddle[64]; +} stbir__per_split_info; + +typedef float * stbir__decode_pixels_func( float * decode, int width_times_channels, void const * input ); +typedef void stbir__alpha_weight_func( float * decode_buffer, int width_times_channels ); +typedef void stbir__horizontal_gather_channels_func( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, + stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ); +typedef void stbir__alpha_unweight_func(float * encode_buffer, int width_times_channels ); +typedef void stbir__encode_pixels_func( void * output, int width_times_channels, float const * encode ); + +struct stbir__info +{ +#ifdef STBIR_PROFILE + union + { + struct { stbir_uint64 total, build, alloc, horizontal, vertical, cleanup, pivot; } named; + stbir_uint64 array[7]; + } profile; + stbir_uint64 * current_zone_excluded_ptr; +#endif + stbir__sampler horizontal; + stbir__sampler vertical; + + void const * input_data; + void * output_data; + + int input_stride_bytes; + int output_stride_bytes; + int ring_buffer_length_bytes; // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter) + int ring_buffer_num_entries; // Total number of entries in the ring buffer. + + stbir_datatype input_type; + stbir_datatype output_type; + + stbir_input_callback * in_pixels_cb; + void * user_data; + stbir_output_callback * out_pixels_cb; + + stbir__extents scanline_extents; + + void * alloced_mem; + stbir__per_split_info * split_info; // by default 1, but there will be N of these allocated based on the thread init you did + + stbir__decode_pixels_func * decode_pixels; + stbir__alpha_weight_func * alpha_weight; + stbir__horizontal_gather_channels_func * horizontal_gather_channels; + stbir__alpha_unweight_func * alpha_unweight; + stbir__encode_pixels_func * encode_pixels; + + int alloc_ring_buffer_num_entries; // Number of entries in the ring buffer that will be allocated + int splits; // count of splits + + stbir_internal_pixel_layout input_pixel_layout_internal; + stbir_internal_pixel_layout output_pixel_layout_internal; + + int input_color_and_type; + int offset_x, offset_y; // offset within output_data + int vertical_first; + int channels; + int effective_channels; // same as channels, except on RGBA/ARGB (7), or XA/AX (3) + size_t alloced_total; +}; + + +#define stbir__max_uint8_as_float 255.0f +#define stbir__max_uint16_as_float 65535.0f +#define stbir__max_uint8_as_float_inverted 3.9215689e-03f // (1.0f/255.0f) +#define stbir__max_uint16_as_float_inverted 1.5259022e-05f // (1.0f/65535.0f) +#define stbir__small_float ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20)) + +// min/max friendly +#define STBIR_CLAMP(x, xmin, xmax) for(;;) { \ + if ( (x) < (xmin) ) (x) = (xmin); \ + if ( (x) > (xmax) ) (x) = (xmax); \ + break; \ +} + +static stbir__inline int stbir__min(int a, int b) +{ + return a < b ? a : b; +} + +static stbir__inline int stbir__max(int a, int b) +{ + return a > b ? a : b; +} + +static float stbir__srgb_uchar_to_linear_float[256] = { + 0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f, + 0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f, + 0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f, + 0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f, + 0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f, + 0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f, + 0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f, + 0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f, + 0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f, + 0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f, + 0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f, + 0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f, + 0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f, + 0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f, + 0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f, + 0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f, + 0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f, + 0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f, + 0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f, + 0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f, + 0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f, + 0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f, + 0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f, + 0.982251f, 0.991102f, 1.0f +}; + +typedef union +{ + unsigned int u; + float f; +} stbir__FP32; + +// From https://gist.github.com/rygorous/2203834 + +static const stbir_uint32 fp32_to_srgb8_tab4[104] = { + 0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d, + 0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a, + 0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033, + 0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067, + 0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5, + 0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2, + 0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143, + 0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af, + 0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240, + 0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300, + 0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401, + 0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559, + 0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723, +}; + +static stbir__inline stbir_uint8 stbir__linear_to_srgb_uchar(float in) +{ + static const stbir__FP32 almostone = { 0x3f7fffff }; // 1-eps + static const stbir__FP32 minval = { (127-13) << 23 }; + stbir_uint32 tab,bias,scale,t; + stbir__FP32 f; + + // Clamp to [2^(-13), 1-eps]; these two values map to 0 and 1, respectively. + // The tests are carefully written so that NaNs map to 0, same as in the reference + // implementation. + if (!(in > minval.f)) // written this way to catch NaNs + return 0; + if (in > almostone.f) + return 255; + + // Do the table lookup and unpack bias, scale + f.f = in; + tab = fp32_to_srgb8_tab4[(f.u - minval.u) >> 20]; + bias = (tab >> 16) << 9; + scale = tab & 0xffff; + + // Grab next-highest mantissa bits and perform linear interpolation + t = (f.u >> 12) & 0xff; + return (unsigned char) ((bias + scale*t) >> 16); +} + +#ifndef STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT +#define STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT 32 // when downsampling and <= 32 scanlines of buffering, use gather. gather used down to 1/8th scaling for 25% win. +#endif + +#ifndef STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS +#define STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS 4 // when threading, what is the minimum number of scanlines for a split? +#endif + +#define STBIR_INPUT_CALLBACK_PADDING 3 + +#ifdef _M_IX86_FP +#if ( _M_IX86_FP >= 1 ) +#ifndef STBIR_SSE +#define STBIR_SSE +#endif +#endif +#endif + +#ifdef __TINYC__ + // tiny c has no intrinsics yet - this can become a version check if they add them + #define STBIR_NO_SIMD +#endif + +#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(_M_AMD64) || defined(__SSE2__) || defined(STBIR_SSE) || defined(STBIR_SSE2) + #ifndef STBIR_SSE2 + #define STBIR_SSE2 + #endif + #if defined(__AVX__) || defined(STBIR_AVX2) + #ifndef STBIR_AVX + #ifndef STBIR_NO_AVX + #define STBIR_AVX + #endif + #endif + #endif + #if defined(__AVX2__) || defined(STBIR_AVX2) + #ifndef STBIR_NO_AVX2 + #ifndef STBIR_AVX2 + #define STBIR_AVX2 + #endif + #if defined( _MSC_VER ) && !defined(__clang__) + #ifndef STBIR_FP16C // FP16C instructions are on all AVX2 cpus, so we can autoselect it here on microsoft - clang needs -m16c + #define STBIR_FP16C + #endif + #endif + #endif + #endif + #ifdef __F16C__ + #ifndef STBIR_FP16C // turn on FP16C instructions if the define is set (for clang and gcc) + #define STBIR_FP16C + #endif + #endif +#endif + +#if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || ((__ARM_NEON_FP & 4) != 0) || defined(__ARM_NEON__) +#ifndef STBIR_NEON +#define STBIR_NEON +#endif +#endif + +#if defined(_M_ARM) || defined(__arm__) +#ifdef STBIR_USE_FMA +#undef STBIR_USE_FMA // no FMA for 32-bit arm on MSVC +#endif +#endif + +#if defined(__wasm__) && defined(__wasm_simd128__) +#ifndef STBIR_WASM +#define STBIR_WASM +#endif +#endif + +// restrict pointers for the output pointers, other loop and unroll control +#if defined( _MSC_VER ) && !defined(__clang__) + #define STBIR_STREAMOUT_PTR( star ) star __restrict + #define STBIR_NO_UNROLL( ptr ) __assume(ptr) // this oddly keeps msvc from unrolling a loop + #if _MSC_VER >= 1900 + #define STBIR_NO_UNROLL_LOOP_START __pragma(loop( no_vector )) + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif +#elif defined( __clang__ ) + #define STBIR_STREAMOUT_PTR( star ) star __restrict__ + #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) + #if ( __clang_major__ >= 4 ) || ( ( __clang_major__ >= 3 ) && ( __clang_minor__ >= 5 ) ) + #define STBIR_NO_UNROLL_LOOP_START _Pragma("clang loop unroll(disable)") _Pragma("clang loop vectorize(disable)") + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif +#elif defined( __GNUC__ ) + #define STBIR_STREAMOUT_PTR( star ) star __restrict__ + #define STBIR_NO_UNROLL( ptr ) __asm__ (""::"r"(ptr)) + #if __GNUC__ >= 14 + #define STBIR_NO_UNROLL_LOOP_START _Pragma("GCC unroll 0") _Pragma("GCC novector") + #else + #define STBIR_NO_UNROLL_LOOP_START + #endif + #define STBIR_NO_UNROLL_LOOP_START_INF_FOR +#else + #define STBIR_STREAMOUT_PTR( star ) star + #define STBIR_NO_UNROLL( ptr ) + #define STBIR_NO_UNROLL_LOOP_START +#endif + +#ifndef STBIR_NO_UNROLL_LOOP_START_INF_FOR +#define STBIR_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START +#endif + +#ifdef STBIR_NO_SIMD // force simd off for whatever reason + +// force simd off overrides everything else, so clear it all + +#ifdef STBIR_SSE2 +#undef STBIR_SSE2 +#endif + +#ifdef STBIR_AVX +#undef STBIR_AVX +#endif + +#ifdef STBIR_NEON +#undef STBIR_NEON +#endif + +#ifdef STBIR_AVX2 +#undef STBIR_AVX2 +#endif + +#ifdef STBIR_FP16C +#undef STBIR_FP16C +#endif + +#ifdef STBIR_WASM +#undef STBIR_WASM +#endif + +#ifdef STBIR_SIMD +#undef STBIR_SIMD +#endif + +#else // STBIR_SIMD + +#ifdef STBIR_SSE2 + #include + + #define stbir__simdf __m128 + #define stbir__simdi __m128i + + #define stbir_simdi_castf( reg ) _mm_castps_si128(reg) + #define stbir_simdf_casti( reg ) _mm_castsi128_ps(reg) + + #define stbir__simdf_load( reg, ptr ) (reg) = _mm_loadu_ps( (float const*)(ptr) ) + #define stbir__simdi_load( reg, ptr ) (reg) = _mm_loadu_si128 ( (stbir__simdi const*)(ptr) ) + #define stbir__simdf_load1( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf) + #define stbir__simdi_load1( out, ptr ) (out) = _mm_castps_si128( _mm_load_ss( (float const*)(ptr) )) + #define stbir__simdf_load1z( out, ptr ) (out) = _mm_load_ss( (float const*)(ptr) ) // top values must be zero + #define stbir__simdf_frep4( fvar ) _mm_set_ps1( fvar ) + #define stbir__simdf_load1frep4( out, fvar ) (out) = _mm_set_ps1( fvar ) + #define stbir__simdf_load2( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values can be random (not denormal or nan for perf) + #define stbir__simdf_load2z( out, ptr ) (out) = _mm_castsi128_ps( _mm_loadl_epi64( (__m128i*)(ptr)) ) // top values must be zero + #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = _mm_castpd_ps(_mm_loadh_pd( _mm_castps_pd(reg), (double*)(ptr) )) + + #define stbir__simdf_zeroP() _mm_setzero_ps() + #define stbir__simdf_zero( reg ) (reg) = _mm_setzero_ps() + + #define stbir__simdf_store( ptr, reg ) _mm_storeu_ps( (float*)(ptr), reg ) + #define stbir__simdf_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), reg ) + #define stbir__simdf_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), _mm_castps_si128(reg) ) + #define stbir__simdf_store2h( ptr, reg ) _mm_storeh_pd( (double*)(ptr), _mm_castps_pd(reg) ) + + #define stbir__simdi_store( ptr, reg ) _mm_storeu_si128( (__m128i*)(ptr), reg ) + #define stbir__simdi_store1( ptr, reg ) _mm_store_ss( (float*)(ptr), _mm_castsi128_ps(reg) ) + #define stbir__simdi_store2( ptr, reg ) _mm_storel_epi64( (__m128i*)(ptr), (reg) ) + + #define stbir__prefetch( ptr ) _mm_prefetch((char*)(ptr), _MM_HINT_T0 ) + + #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \ + { \ + stbir__simdi zero = _mm_setzero_si128(); \ + out2 = _mm_unpacklo_epi8( ireg, zero ); \ + out3 = _mm_unpackhi_epi8( ireg, zero ); \ + out0 = _mm_unpacklo_epi16( out2, zero ); \ + out1 = _mm_unpackhi_epi16( out2, zero ); \ + out2 = _mm_unpacklo_epi16( out3, zero ); \ + out3 = _mm_unpackhi_epi16( out3, zero ); \ + } + +#define stbir__simdi_expand_u8_to_1u32(out,ireg) \ + { \ + stbir__simdi zero = _mm_setzero_si128(); \ + out = _mm_unpacklo_epi8( ireg, zero ); \ + out = _mm_unpacklo_epi16( out, zero ); \ + } + + #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \ + { \ + stbir__simdi zero = _mm_setzero_si128(); \ + out0 = _mm_unpacklo_epi16( ireg, zero ); \ + out1 = _mm_unpackhi_epi16( ireg, zero ); \ + } + + #define stbir__simdf_convert_float_to_i32( i, f ) (i) = _mm_cvttps_epi32(f) + #define stbir__simdf_convert_float_to_int( f ) _mm_cvtt_ss2si(f) + #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),_mm_setzero_ps())))) + #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)_mm_cvtsi128_si32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())))) + + #define stbir__simdi_to_int( i ) _mm_cvtsi128_si32(i) + #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = _mm_cvtepi32_ps( ireg ) + #define stbir__simdf_add( out, reg0, reg1 ) (out) = _mm_add_ps( reg0, reg1 ) + #define stbir__simdf_mult( out, reg0, reg1 ) (out) = _mm_mul_ps( reg0, reg1 ) + #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = _mm_mul_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) ) + #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = _mm_mul_ss( reg, _mm_load_ss( (float const*)(ptr) ) ) + #define stbir__simdf_add_mem( out, reg, ptr ) (out) = _mm_add_ps( reg, _mm_loadu_ps( (float const*)(ptr) ) ) + #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = _mm_add_ss( reg, _mm_load_ss( (float const*)(ptr) ) ) + + #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd + #include + #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_fmadd_ps( mul1, mul2, add ) + #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_fmadd_ss( mul1, mul2, add ) + #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ps( mul, _mm_loadu_ps( (float const*)(ptr) ), add ) + #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_fmadd_ss( mul, _mm_load_ss( (float const*)(ptr) ), add ) + #else + #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = _mm_add_ps( add, _mm_mul_ps( mul1, mul2 ) ) + #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = _mm_add_ss( add, _mm_mul_ss( mul1, mul2 ) ) + #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = _mm_add_ps( add, _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ) ) + #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = _mm_add_ss( add, _mm_mul_ss( mul, _mm_load_ss( (float const*)(ptr) ) ) ) + #endif + + #define stbir__simdf_add1( out, reg0, reg1 ) (out) = _mm_add_ss( reg0, reg1 ) + #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = _mm_mul_ss( reg0, reg1 ) + + #define stbir__simdf_and( out, reg0, reg1 ) (out) = _mm_and_ps( reg0, reg1 ) + #define stbir__simdf_or( out, reg0, reg1 ) (out) = _mm_or_ps( reg0, reg1 ) + + #define stbir__simdf_min( out, reg0, reg1 ) (out) = _mm_min_ps( reg0, reg1 ) + #define stbir__simdf_max( out, reg0, reg1 ) (out) = _mm_max_ps( reg0, reg1 ) + #define stbir__simdf_min1( out, reg0, reg1 ) (out) = _mm_min_ss( reg0, reg1 ) + #define stbir__simdf_max1( out, reg0, reg1 ) (out) = _mm_max_ss( reg0, reg1 ) + + #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (3<<0) + (0<<2) + (1<<4) + (2<<6) ) ) + #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_shuffle_ps( reg1,reg0, (0<<0) + (1<<2) + (2<<4) + (3<<6) )), (2<<0) + (3<<2) + (0<<4) + (1<<6) ) ) + + static const stbir__simdf STBIR_zeroones = { 0.0f,1.0f,0.0f,1.0f }; + static const stbir__simdf STBIR_onezeros = { 1.0f,0.0f,1.0f,0.0f }; + #define stbir__simdf_aaa1( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movehl_ps( ones, alp ) ), (1<<0) + (1<<2) + (1<<4) + (2<<6) ) ) + #define stbir__simdf_1aaa( out, alp, ones ) (out)=_mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( _mm_movelh_ps( ones, alp ) ), (0<<0) + (2<<2) + (2<<4) + (2<<6) ) ) + #define stbir__simdf_a1a1( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_srli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_zeroones ) + #define stbir__simdf_1a1a( out, alp, ones) (out) = _mm_or_ps( _mm_castsi128_ps( _mm_slli_epi64( _mm_castps_si128(alp), 32 ) ), STBIR_onezeros ) + + #define stbir__simdf_swiz( reg, one, two, three, four ) _mm_castsi128_ps( _mm_shuffle_epi32( _mm_castps_si128( reg ), (one<<0) + (two<<2) + (three<<4) + (four<<6) ) ) + + #define stbir__simdi_and( out, reg0, reg1 ) (out) = _mm_and_si128( reg0, reg1 ) + #define stbir__simdi_or( out, reg0, reg1 ) (out) = _mm_or_si128( reg0, reg1 ) + #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = _mm_madd_epi16( reg0, reg1 ) + + #define stbir__simdf_pack_to_8bytes(out,aa,bb) \ + { \ + stbir__simdf af,bf; \ + stbir__simdi a,b; \ + af = _mm_min_ps( aa, STBIR_max_uint8_as_float ); \ + bf = _mm_min_ps( bb, STBIR_max_uint8_as_float ); \ + af = _mm_max_ps( af, _mm_setzero_ps() ); \ + bf = _mm_max_ps( bf, _mm_setzero_ps() ); \ + a = _mm_cvttps_epi32( af ); \ + b = _mm_cvttps_epi32( bf ); \ + a = _mm_packs_epi32( a, b ); \ + out = _mm_packus_epi16( a, a ); \ + } + + #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \ + stbir__simdf_load( o0, (ptr) ); \ + stbir__simdf_load( o1, (ptr)+4 ); \ + stbir__simdf_load( o2, (ptr)+8 ); \ + stbir__simdf_load( o3, (ptr)+12 ); \ + { \ + __m128 tmp0, tmp1, tmp2, tmp3; \ + tmp0 = _mm_unpacklo_ps(o0, o1); \ + tmp2 = _mm_unpacklo_ps(o2, o3); \ + tmp1 = _mm_unpackhi_ps(o0, o1); \ + tmp3 = _mm_unpackhi_ps(o2, o3); \ + o0 = _mm_movelh_ps(tmp0, tmp2); \ + o1 = _mm_movehl_ps(tmp2, tmp0); \ + o2 = _mm_movelh_ps(tmp1, tmp3); \ + o3 = _mm_movehl_ps(tmp3, tmp1); \ + } + + #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \ + r0 = _mm_packs_epi32( r0, r1 ); \ + r2 = _mm_packs_epi32( r2, r3 ); \ + r1 = _mm_unpacklo_epi16( r0, r2 ); \ + r3 = _mm_unpackhi_epi16( r0, r2 ); \ + r0 = _mm_unpacklo_epi16( r1, r3 ); \ + r2 = _mm_unpackhi_epi16( r1, r3 ); \ + r0 = _mm_packus_epi16( r0, r2 ); \ + stbir__simdi_store( ptr, r0 ); \ + + #define stbir__simdi_32shr( out, reg, imm ) out = _mm_srli_epi32( reg, imm ) + + #if defined(_MSC_VER) && !defined(__clang__) + // msvc inits with 8 bytes + #define STBIR__CONST_32_TO_8( v ) (char)(unsigned char)((v)&255),(char)(unsigned char)(((v)>>8)&255),(char)(unsigned char)(((v)>>16)&255),(char)(unsigned char)(((v)>>24)&255) + #define STBIR__CONST_4_32i( v ) STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ), STBIR__CONST_32_TO_8( v ) + #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) STBIR__CONST_32_TO_8( v0 ), STBIR__CONST_32_TO_8( v1 ), STBIR__CONST_32_TO_8( v2 ), STBIR__CONST_32_TO_8( v3 ) + #else + // everything else inits with long long's + #define STBIR__CONST_4_32i( v ) (long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))),(long long)((((stbir_uint64)(stbir_uint32)(v))<<32)|((stbir_uint64)(stbir_uint32)(v))) + #define STBIR__CONST_4d_32i( v0, v1, v2, v3 ) (long long)((((stbir_uint64)(stbir_uint32)(v1))<<32)|((stbir_uint64)(stbir_uint32)(v0))),(long long)((((stbir_uint64)(stbir_uint32)(v3))<<32)|((stbir_uint64)(stbir_uint32)(v2))) + #endif + + #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x } + #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { STBIR__CONST_4_32i(x) } + #define STBIR__CONSTF(var) (var) + #define STBIR__CONSTI(var) (var) + + #if defined(STBIR_AVX) || defined(__SSE4_1__) + #include + #define stbir__simdf_pack_to_8words(out,reg0,reg1) out = _mm_packus_epi32(_mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())), _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps()))) + #else + static STBIR__SIMDI_CONST(stbir__s32_32768, 32768); + static STBIR__SIMDI_CONST(stbir__s16_32768, ((32768<<16)|32768)); + + #define stbir__simdf_pack_to_8words(out,reg0,reg1) \ + { \ + stbir__simdi tmp0,tmp1; \ + tmp0 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg0,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \ + tmp1 = _mm_cvttps_epi32(_mm_max_ps(_mm_min_ps(reg1,STBIR__CONSTF(STBIR_max_uint16_as_float)),_mm_setzero_ps())); \ + tmp0 = _mm_sub_epi32( tmp0, stbir__s32_32768 ); \ + tmp1 = _mm_sub_epi32( tmp1, stbir__s32_32768 ); \ + out = _mm_packs_epi32( tmp0, tmp1 ); \ + out = _mm_sub_epi16( out, stbir__s16_32768 ); \ + } + + #endif + + #define STBIR_SIMD + + // if we detect AVX, set the simd8 defines + #ifdef STBIR_AVX + #include + #define STBIR_SIMD8 + #define stbir__simdf8 __m256 + #define stbir__simdi8 __m256i + #define stbir__simdf8_load( out, ptr ) (out) = _mm256_loadu_ps( (float const *)(ptr) ) + #define stbir__simdi8_load( out, ptr ) (out) = _mm256_loadu_si256( (__m256i const *)(ptr) ) + #define stbir__simdf8_mult( out, a, b ) (out) = _mm256_mul_ps( (a), (b) ) + #define stbir__simdf8_store( ptr, out ) _mm256_storeu_ps( (float*)(ptr), out ) + #define stbir__simdi8_store( ptr, reg ) _mm256_storeu_si256( (__m256i*)(ptr), reg ) + #define stbir__simdf8_frep8( fval ) _mm256_set1_ps( fval ) + + #define stbir__simdf8_min( out, reg0, reg1 ) (out) = _mm256_min_ps( reg0, reg1 ) + #define stbir__simdf8_max( out, reg0, reg1 ) (out) = _mm256_max_ps( reg0, reg1 ) + + #define stbir__simdf8_add4halves( out, bot4, top8 ) (out) = _mm_add_ps( bot4, _mm256_extractf128_ps( top8, 1 ) ) + #define stbir__simdf8_mult_mem( out, reg, ptr ) (out) = _mm256_mul_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) ) + #define stbir__simdf8_add_mem( out, reg, ptr ) (out) = _mm256_add_ps( reg, _mm256_loadu_ps( (float const*)(ptr) ) ) + #define stbir__simdf8_add( out, a, b ) (out) = _mm256_add_ps( a, b ) + #define stbir__simdf8_load1b( out, ptr ) (out) = _mm256_broadcast_ss( ptr ) + #define stbir__simdf_load1rep4( out, ptr ) (out) = _mm_broadcast_ss( ptr ) // avx load instruction + + #define stbir__simdi8_convert_i32_to_float(out, ireg) (out) = _mm256_cvtepi32_ps( ireg ) + #define stbir__simdf8_convert_float_to_i32( i, f ) (i) = _mm256_cvttps_epi32(f) + + #define stbir__simdf8_bot4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (0<<0)+(2<<4) ) + #define stbir__simdf8_top4s( out, a, b ) (out) = _mm256_permute2f128_ps(a,b, (1<<0)+(3<<4) ) + + #define stbir__simdf8_gettop4( reg ) _mm256_extractf128_ps(reg,1) + + #ifdef STBIR_AVX2 + + #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \ + { \ + stbir__simdi8 a, zero =_mm256_setzero_si256();\ + a = _mm256_permute4x64_epi64( _mm256_unpacklo_epi8( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), zero ),(0<<0)+(2<<2)+(1<<4)+(3<<6)); \ + out0 = _mm256_unpacklo_epi16( a, zero ); \ + out1 = _mm256_unpackhi_epi16( a, zero ); \ + } + + #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \ + { \ + stbir__simdi8 t; \ + stbir__simdf8 af,bf; \ + stbir__simdi8 a,b; \ + af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \ + bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \ + af = _mm256_max_ps( af, _mm256_setzero_ps() ); \ + bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \ + a = _mm256_cvttps_epi32( af ); \ + b = _mm256_cvttps_epi32( bf ); \ + t = _mm256_permute4x64_epi64( _mm256_packs_epi32( a, b ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \ + out = _mm256_castsi256_si128( _mm256_permute4x64_epi64( _mm256_packus_epi16( t, t ), (0<<0)+(2<<2)+(1<<4)+(3<<6) ) ); \ + } + + #define stbir__simdi8_expand_u16_to_u32(out,ireg) out = _mm256_unpacklo_epi16( _mm256_permute4x64_epi64(_mm256_castsi128_si256(ireg),(0<<0)+(2<<2)+(1<<4)+(3<<6)), _mm256_setzero_si256() ); + + #define stbir__simdf8_pack_to_16words(out,aa,bb) \ + { \ + stbir__simdf8 af,bf; \ + stbir__simdi8 a,b; \ + af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \ + bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \ + af = _mm256_max_ps( af, _mm256_setzero_ps() ); \ + bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \ + a = _mm256_cvttps_epi32( af ); \ + b = _mm256_cvttps_epi32( bf ); \ + (out) = _mm256_permute4x64_epi64( _mm256_packus_epi32(a, b), (0<<0)+(2<<2)+(1<<4)+(3<<6) ); \ + } + + #else + + #define stbir__simdi8_expand_u8_to_u32(out0,out1,ireg) \ + { \ + stbir__simdi a,zero = _mm_setzero_si128(); \ + a = _mm_unpacklo_epi8( ireg, zero ); \ + out0 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \ + a = _mm_unpackhi_epi8( ireg, zero ); \ + out1 = _mm256_setr_m128i( _mm_unpacklo_epi16( a, zero ), _mm_unpackhi_epi16( a, zero ) ); \ + } + + #define stbir__simdf8_pack_to_16bytes(out,aa,bb) \ + { \ + stbir__simdi t; \ + stbir__simdf8 af,bf; \ + stbir__simdi8 a,b; \ + af = _mm256_min_ps( aa, STBIR_max_uint8_as_floatX ); \ + bf = _mm256_min_ps( bb, STBIR_max_uint8_as_floatX ); \ + af = _mm256_max_ps( af, _mm256_setzero_ps() ); \ + bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \ + a = _mm256_cvttps_epi32( af ); \ + b = _mm256_cvttps_epi32( bf ); \ + out = _mm_packs_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \ + out = _mm_packus_epi16( out, out ); \ + t = _mm_packs_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \ + t = _mm_packus_epi16( t, t ); \ + out = _mm_castps_si128( _mm_shuffle_ps( _mm_castsi128_ps(out), _mm_castsi128_ps(t), (0<<0)+(1<<2)+(0<<4)+(1<<6) ) ); \ + } + + #define stbir__simdi8_expand_u16_to_u32(out,ireg) \ + { \ + stbir__simdi a,b,zero = _mm_setzero_si128(); \ + a = _mm_unpacklo_epi16( ireg, zero ); \ + b = _mm_unpackhi_epi16( ireg, zero ); \ + out = _mm256_insertf128_si256( _mm256_castsi128_si256( a ), b, 1 ); \ + } + + #define stbir__simdf8_pack_to_16words(out,aa,bb) \ + { \ + stbir__simdi t0,t1; \ + stbir__simdf8 af,bf; \ + stbir__simdi8 a,b; \ + af = _mm256_min_ps( aa, STBIR_max_uint16_as_floatX ); \ + bf = _mm256_min_ps( bb, STBIR_max_uint16_as_floatX ); \ + af = _mm256_max_ps( af, _mm256_setzero_ps() ); \ + bf = _mm256_max_ps( bf, _mm256_setzero_ps() ); \ + a = _mm256_cvttps_epi32( af ); \ + b = _mm256_cvttps_epi32( bf ); \ + t0 = _mm_packus_epi32( _mm256_castsi256_si128(a), _mm256_extractf128_si256( a, 1 ) ); \ + t1 = _mm_packus_epi32( _mm256_castsi256_si128(b), _mm256_extractf128_si256( b, 1 ) ); \ + out = _mm256_setr_m128i( t0, t1 ); \ + } + + #endif + + static __m256i stbir_00001111 = { STBIR__CONST_4d_32i( 0, 0, 0, 0 ), STBIR__CONST_4d_32i( 1, 1, 1, 1 ) }; + #define stbir__simdf8_0123to00001111( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00001111 ) + + static __m256i stbir_22223333 = { STBIR__CONST_4d_32i( 2, 2, 2, 2 ), STBIR__CONST_4d_32i( 3, 3, 3, 3 ) }; + #define stbir__simdf8_0123to22223333( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_22223333 ) + + #define stbir__simdf8_0123to2222( out, in ) (out) = stbir__simdf_swiz(_mm256_castps256_ps128(in), 2,2,2,2 ) + + #define stbir__simdf8_load4b( out, ptr ) (out) = _mm256_broadcast_ps( (__m128 const *)(ptr) ) + + static __m256i stbir_00112233 = { STBIR__CONST_4d_32i( 0, 0, 1, 1 ), STBIR__CONST_4d_32i( 2, 2, 3, 3 ) }; + #define stbir__simdf8_0123to00112233( out, in ) (out) = _mm256_permutevar_ps ( in, stbir_00112233 ) + #define stbir__simdf8_add4( out, a8, b ) (out) = _mm256_add_ps( a8, _mm256_castps128_ps256( b ) ) + + static __m256i stbir_load6 = { STBIR__CONST_4_32i( 0x80000000 ), STBIR__CONST_4d_32i( 0x80000000, 0x80000000, 0, 0 ) }; + #define stbir__simdf8_load6z( out, ptr ) (out) = _mm256_maskload_ps( ptr, stbir_load6 ) + + #define stbir__simdf8_0123to00000000( out, in ) (out) = _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(0<<4)+(0<<6) ) + #define stbir__simdf8_0123to11111111( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(1<<4)+(1<<6) ) + #define stbir__simdf8_0123to22222222( out, in ) (out) = _mm256_shuffle_ps ( in, in, (2<<0)+(2<<2)+(2<<4)+(2<<6) ) + #define stbir__simdf8_0123to33333333( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(3<<2)+(3<<4)+(3<<6) ) + #define stbir__simdf8_0123to21032103( out, in ) (out) = _mm256_shuffle_ps ( in, in, (2<<0)+(1<<2)+(0<<4)+(3<<6) ) + #define stbir__simdf8_0123to32103210( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(2<<2)+(1<<4)+(0<<6) ) + #define stbir__simdf8_0123to12301230( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(2<<2)+(3<<4)+(0<<6) ) + #define stbir__simdf8_0123to10321032( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(0<<2)+(3<<4)+(2<<6) ) + #define stbir__simdf8_0123to30123012( out, in ) (out) = _mm256_shuffle_ps ( in, in, (3<<0)+(0<<2)+(1<<4)+(2<<6) ) + + #define stbir__simdf8_0123to11331133( out, in ) (out) = _mm256_shuffle_ps ( in, in, (1<<0)+(1<<2)+(3<<4)+(3<<6) ) + #define stbir__simdf8_0123to00220022( out, in ) (out) = _mm256_shuffle_ps ( in, in, (0<<0)+(0<<2)+(2<<4)+(2<<6) ) + + #define stbir__simdf8_aaa1( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(1<<1)+(1<<2)+(0<<3)+(1<<4)+(1<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (3<<0) + (3<<2) + (3<<4) + (0<<6) ) + #define stbir__simdf8_1aaa( out, alp, ones ) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(1<<2)+(1<<3)+(0<<4)+(1<<5)+(1<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (0<<4) + (0<<6) ) + #define stbir__simdf8_a1a1( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (1<<0)+(0<<1)+(1<<2)+(0<<3)+(1<<4)+(0<<5)+(1<<6)+(0<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) ) + #define stbir__simdf8_1a1a( out, alp, ones) (out) = _mm256_blend_ps( alp, ones, (0<<0)+(1<<1)+(0<<2)+(1<<3)+(0<<4)+(1<<5)+(0<<6)+(1<<7)); (out)=_mm256_shuffle_ps( out,out, (1<<0) + (0<<2) + (3<<4) + (2<<6) ) + + #define stbir__simdf8_zero( reg ) (reg) = _mm256_setzero_ps() + + #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd + #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_fmadd_ps( mul1, mul2, add ) + #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_fmadd_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ), add ) + #define stbir__simdf8_madd_mem4( out, add, mul, ptr )(out) = _mm256_fmadd_ps( _mm256_setr_m128( mul, _mm_setzero_ps() ), _mm256_setr_m128( _mm_loadu_ps( (float const*)(ptr) ), _mm_setzero_ps() ), add ) + #else + #define stbir__simdf8_madd( out, add, mul1, mul2 ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul1, mul2 ) ) + #define stbir__simdf8_madd_mem( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_mul_ps( mul, _mm256_loadu_ps( (float const*)(ptr) ) ) ) + #define stbir__simdf8_madd_mem4( out, add, mul, ptr ) (out) = _mm256_add_ps( add, _mm256_setr_m128( _mm_mul_ps( mul, _mm_loadu_ps( (float const*)(ptr) ) ), _mm_setzero_ps() ) ) + #endif + #define stbir__if_simdf8_cast_to_simdf4( val ) _mm256_castps256_ps128( val ) + + #endif + + #ifdef STBIR_FLOORF + #undef STBIR_FLOORF + #endif + #define STBIR_FLOORF stbir_simd_floorf + static stbir__inline float stbir_simd_floorf(float x) // martins floorf + { + #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41) + __m128 t = _mm_set_ss(x); + return _mm_cvtss_f32( _mm_floor_ss(t, t) ); + #else + __m128 f = _mm_set_ss(x); + __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f)); + __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(f, t), _mm_set_ss(-1.0f))); + return _mm_cvtss_f32(r); + #endif + } + + #ifdef STBIR_CEILF + #undef STBIR_CEILF + #endif + #define STBIR_CEILF stbir_simd_ceilf + static stbir__inline float stbir_simd_ceilf(float x) // martins ceilf + { + #if defined(STBIR_AVX) || defined(__SSE4_1__) || defined(STBIR_SSE41) + __m128 t = _mm_set_ss(x); + return _mm_cvtss_f32( _mm_ceil_ss(t, t) ); + #else + __m128 f = _mm_set_ss(x); + __m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(f)); + __m128 r = _mm_add_ss(t, _mm_and_ps(_mm_cmplt_ss(t, f), _mm_set_ss(1.0f))); + return _mm_cvtss_f32(r); + #endif + } + +#elif defined(STBIR_NEON) + + #include + + #define stbir__simdf float32x4_t + #define stbir__simdi uint32x4_t + + #define stbir_simdi_castf( reg ) vreinterpretq_u32_f32(reg) + #define stbir_simdf_casti( reg ) vreinterpretq_f32_u32(reg) + + #define stbir__simdf_load( reg, ptr ) (reg) = vld1q_f32( (float const*)(ptr) ) + #define stbir__simdi_load( reg, ptr ) (reg) = vld1q_u32( (uint32_t const*)(ptr) ) + #define stbir__simdf_load1( out, ptr ) (out) = vld1q_dup_f32( (float const*)(ptr) ) // top values can be random (not denormal or nan for perf) + #define stbir__simdi_load1( out, ptr ) (out) = vld1q_dup_u32( (uint32_t const*)(ptr) ) + #define stbir__simdf_load1z( out, ptr ) (out) = vld1q_lane_f32( (float const*)(ptr), vdupq_n_f32(0), 0 ) // top values must be zero + #define stbir__simdf_frep4( fvar ) vdupq_n_f32( fvar ) + #define stbir__simdf_load1frep4( out, fvar ) (out) = vdupq_n_f32( fvar ) + #define stbir__simdf_load2( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values can be random (not denormal or nan for perf) + #define stbir__simdf_load2z( out, ptr ) (out) = vcombine_f32( vld1_f32( (float const*)(ptr) ), vcreate_f32(0) ) // top values must be zero + #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = vcombine_f32( vget_low_f32(reg), vld1_f32( (float const*)(ptr) ) ) + + #define stbir__simdf_zeroP() vdupq_n_f32(0) + #define stbir__simdf_zero( reg ) (reg) = vdupq_n_f32(0) + + #define stbir__simdf_store( ptr, reg ) vst1q_f32( (float*)(ptr), reg ) + #define stbir__simdf_store1( ptr, reg ) vst1q_lane_f32( (float*)(ptr), reg, 0) + #define stbir__simdf_store2( ptr, reg ) vst1_f32( (float*)(ptr), vget_low_f32(reg) ) + #define stbir__simdf_store2h( ptr, reg ) vst1_f32( (float*)(ptr), vget_high_f32(reg) ) + + #define stbir__simdi_store( ptr, reg ) vst1q_u32( (uint32_t*)(ptr), reg ) + #define stbir__simdi_store1( ptr, reg ) vst1q_lane_u32( (uint32_t*)(ptr), reg, 0 ) + #define stbir__simdi_store2( ptr, reg ) vst1_u32( (uint32_t*)(ptr), vget_low_u32(reg) ) + + #define stbir__prefetch( ptr ) + + #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \ + { \ + uint16x8_t l = vmovl_u8( vget_low_u8 ( vreinterpretq_u8_u32(ireg) ) ); \ + uint16x8_t h = vmovl_u8( vget_high_u8( vreinterpretq_u8_u32(ireg) ) ); \ + out0 = vmovl_u16( vget_low_u16 ( l ) ); \ + out1 = vmovl_u16( vget_high_u16( l ) ); \ + out2 = vmovl_u16( vget_low_u16 ( h ) ); \ + out3 = vmovl_u16( vget_high_u16( h ) ); \ + } + + #define stbir__simdi_expand_u8_to_1u32(out,ireg) \ + { \ + uint16x8_t tmp = vmovl_u8( vget_low_u8( vreinterpretq_u8_u32(ireg) ) ); \ + out = vmovl_u16( vget_low_u16( tmp ) ); \ + } + + #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \ + { \ + uint16x8_t tmp = vreinterpretq_u16_u32(ireg); \ + out0 = vmovl_u16( vget_low_u16 ( tmp ) ); \ + out1 = vmovl_u16( vget_high_u16( tmp ) ); \ + } + + #define stbir__simdf_convert_float_to_i32( i, f ) (i) = vreinterpretq_u32_s32( vcvtq_s32_f32(f) ) + #define stbir__simdf_convert_float_to_int( f ) vgetq_lane_s32(vcvtq_s32_f32(f), 0) + #define stbir__simdi_to_int( i ) (int)vgetq_lane_u32(i, 0) + #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint8_as_float)),vdupq_n_f32(0))), 0)) + #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)vgetq_lane_s32(vcvtq_s32_f32(vmaxq_f32(vminq_f32(f,STBIR__CONSTF(STBIR_max_uint16_as_float)),vdupq_n_f32(0))), 0)) + #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = vcvtq_f32_s32( vreinterpretq_s32_u32(ireg) ) + #define stbir__simdf_add( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 ) + #define stbir__simdf_mult( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 ) + #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_f32( (float const*)(ptr) ) ) + #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = vmulq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) ) + #define stbir__simdf_add_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_f32( (float const*)(ptr) ) ) + #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = vaddq_f32( reg, vld1q_dup_f32( (float const*)(ptr) ) ) + + #ifdef STBIR_USE_FMA // not on by default to maintain bit identical simd to non-simd (and also x64 no madd to arm madd) + #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 ) + #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vfmaq_f32( add, mul1, mul2 ) + #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_f32( (float const*)(ptr) ) ) + #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vfmaq_f32( add, mul, vld1q_dup_f32( (float const*)(ptr) ) ) + #else + #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) ) + #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = vaddq_f32( add, vmulq_f32( mul1, mul2 ) ) + #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_f32( (float const*)(ptr) ) ) ) + #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = vaddq_f32( add, vmulq_f32( mul, vld1q_dup_f32( (float const*)(ptr) ) ) ) + #endif + + #define stbir__simdf_add1( out, reg0, reg1 ) (out) = vaddq_f32( reg0, reg1 ) + #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = vmulq_f32( reg0, reg1 ) + + #define stbir__simdf_and( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vandq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) ) + #define stbir__simdf_or( out, reg0, reg1 ) (out) = vreinterpretq_f32_u32( vorrq_u32( vreinterpretq_u32_f32(reg0), vreinterpretq_u32_f32(reg1) ) ) + + #define stbir__simdf_min( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 ) + #define stbir__simdf_max( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 ) + #define stbir__simdf_min1( out, reg0, reg1 ) (out) = vminq_f32( reg0, reg1 ) + #define stbir__simdf_max1( out, reg0, reg1 ) (out) = vmaxq_f32( reg0, reg1 ) + + #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 3 ) + #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = vextq_f32( reg0, reg1, 2 ) + + #define stbir__simdf_a1a1( out, alp, ones ) (out) = vzipq_f32(vuzpq_f32(alp, alp).val[1], ones).val[0] + #define stbir__simdf_1a1a( out, alp, ones ) (out) = vzipq_f32(ones, vuzpq_f32(alp, alp).val[0]).val[0] + + #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) + + #define stbir__simdf_aaa1( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3, ones, 3) + #define stbir__simdf_1aaa( out, alp, ones ) (out) = vcopyq_laneq_f32(vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0, ones, 0) + + #if defined( _MSC_VER ) && !defined(__clang__) + #define stbir_make16(a,b,c,d) vcombine_u8( \ + vcreate_u8( (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \ + ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56)), \ + vcreate_u8( (4*c+0) | ((4*c+1)<<8) | ((4*c+2)<<16) | ((4*c+3)<<24) | \ + ((stbir_uint64)(4*d+0)<<32) | ((stbir_uint64)(4*d+1)<<40) | ((stbir_uint64)(4*d+2)<<48) | ((stbir_uint64)(4*d+3)<<56) ) ) + + static stbir__inline uint8x16x2_t stbir_make16x2(float32x4_t rega,float32x4_t regb) + { + uint8x16x2_t r = { vreinterpretq_u8_f32(rega), vreinterpretq_u8_f32(regb) }; + return r; + } + #else + #define stbir_make16(a,b,c,d) (uint8x16_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3,4*c+0,4*c+1,4*c+2,4*c+3,4*d+0,4*d+1,4*d+2,4*d+3} + #define stbir_make16x2(a,b) (uint8x16x2_t){{vreinterpretq_u8_f32(a),vreinterpretq_u8_f32(b)}} + #endif + + #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vqtbl1q_u8( vreinterpretq_u8_f32(reg), stbir_make16(one, two, three, four) ) ) + #define stbir__simdf_swiz2( rega, regb, one, two, three, four ) vreinterpretq_f32_u8( vqtbl2q_u8( stbir_make16x2(rega,regb), stbir_make16(one, two, three, four) ) ) + + #define stbir__simdi_16madd( out, reg0, reg1 ) \ + { \ + int16x8_t r0 = vreinterpretq_s16_u32(reg0); \ + int16x8_t r1 = vreinterpretq_s16_u32(reg1); \ + int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \ + int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \ + (out) = vreinterpretq_u32_s32( vpaddq_s32(tmp0, tmp1) ); \ + } + + #else + + #define stbir__simdf_aaa1( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 3)), 3) + #define stbir__simdf_1aaa( out, alp, ones ) (out) = vsetq_lane_f32(1.0f, vdupq_n_f32(vgetq_lane_f32(alp, 0)), 0) + + #if defined( _MSC_VER ) && !defined(__clang__) + static stbir__inline uint8x8x2_t stbir_make8x2(float32x4_t reg) + { + uint8x8x2_t r = { { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } }; + return r; + } + #define stbir_make8(a,b) vcreate_u8( \ + (4*a+0) | ((4*a+1)<<8) | ((4*a+2)<<16) | ((4*a+3)<<24) | \ + ((stbir_uint64)(4*b+0)<<32) | ((stbir_uint64)(4*b+1)<<40) | ((stbir_uint64)(4*b+2)<<48) | ((stbir_uint64)(4*b+3)<<56) ) + #else + #define stbir_make8x2(reg) (uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_f32(reg)), vget_high_u8(vreinterpretq_u8_f32(reg)) } } + #define stbir_make8(a,b) (uint8x8_t){4*a+0,4*a+1,4*a+2,4*a+3,4*b+0,4*b+1,4*b+2,4*b+3} + #endif + + #define stbir__simdf_swiz( reg, one, two, three, four ) vreinterpretq_f32_u8( vcombine_u8( \ + vtbl2_u8( stbir_make8x2( reg ), stbir_make8( one, two ) ), \ + vtbl2_u8( stbir_make8x2( reg ), stbir_make8( three, four ) ) ) ) + + #define stbir__simdi_16madd( out, reg0, reg1 ) \ + { \ + int16x8_t r0 = vreinterpretq_s16_u32(reg0); \ + int16x8_t r1 = vreinterpretq_s16_u32(reg1); \ + int32x4_t tmp0 = vmull_s16( vget_low_s16(r0), vget_low_s16(r1) ); \ + int32x4_t tmp1 = vmull_s16( vget_high_s16(r0), vget_high_s16(r1) ); \ + int32x2_t out0 = vpadd_s32( vget_low_s32(tmp0), vget_high_s32(tmp0) ); \ + int32x2_t out1 = vpadd_s32( vget_low_s32(tmp1), vget_high_s32(tmp1) ); \ + (out) = vreinterpretq_u32_s32( vcombine_s32(out0, out1) ); \ + } + + #endif + + #define stbir__simdi_and( out, reg0, reg1 ) (out) = vandq_u32( reg0, reg1 ) + #define stbir__simdi_or( out, reg0, reg1 ) (out) = vorrq_u32( reg0, reg1 ) + + #define stbir__simdf_pack_to_8bytes(out,aa,bb) \ + { \ + float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \ + float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint8_as_float) ), vdupq_n_f32(0) ); \ + int16x4_t ai = vqmovn_s32( vcvtq_s32_f32( af ) ); \ + int16x4_t bi = vqmovn_s32( vcvtq_s32_f32( bf ) ); \ + uint8x8_t out8 = vqmovun_s16( vcombine_s16(ai, bi) ); \ + out = vreinterpretq_u32_u8( vcombine_u8(out8, out8) ); \ + } + + #define stbir__simdf_pack_to_8words(out,aa,bb) \ + { \ + float32x4_t af = vmaxq_f32( vminq_f32(aa,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \ + float32x4_t bf = vmaxq_f32( vminq_f32(bb,STBIR__CONSTF(STBIR_max_uint16_as_float) ), vdupq_n_f32(0) ); \ + int32x4_t ai = vcvtq_s32_f32( af ); \ + int32x4_t bi = vcvtq_s32_f32( bf ); \ + out = vreinterpretq_u32_u16( vcombine_u16(vqmovun_s32(ai), vqmovun_s32(bi)) ); \ + } + + #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \ + { \ + int16x4x2_t tmp0 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r0)), vqmovn_s32(vreinterpretq_s32_u32(r2)) ); \ + int16x4x2_t tmp1 = vzip_s16( vqmovn_s32(vreinterpretq_s32_u32(r1)), vqmovn_s32(vreinterpretq_s32_u32(r3)) ); \ + uint8x8x2_t out = \ + { { \ + vqmovun_s16( vcombine_s16(tmp0.val[0], tmp0.val[1]) ), \ + vqmovun_s16( vcombine_s16(tmp1.val[0], tmp1.val[1]) ), \ + } }; \ + vst2_u8(ptr, out); \ + } + + #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \ + { \ + float32x4x4_t tmp = vld4q_f32(ptr); \ + o0 = tmp.val[0]; \ + o1 = tmp.val[1]; \ + o2 = tmp.val[2]; \ + o3 = tmp.val[3]; \ + } + + #define stbir__simdi_32shr( out, reg, imm ) out = vshrq_n_u32( reg, imm ) + + #if defined( _MSC_VER ) && !defined(__clang__) + #define STBIR__SIMDF_CONST(var, x) __declspec(align(8)) float var[] = { x, x, x, x } + #define STBIR__SIMDI_CONST(var, x) __declspec(align(8)) uint32_t var[] = { x, x, x, x } + #define STBIR__CONSTF(var) (*(const float32x4_t*)var) + #define STBIR__CONSTI(var) (*(const uint32x4_t*)var) + #else + #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = { x, x, x, x } + #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x } + #define STBIR__CONSTF(var) (var) + #define STBIR__CONSTI(var) (var) + #endif + + #ifdef STBIR_FLOORF + #undef STBIR_FLOORF + #endif + #define STBIR_FLOORF stbir_simd_floorf + static stbir__inline float stbir_simd_floorf(float x) + { + #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) + return vget_lane_f32( vrndm_f32( vdup_n_f32(x) ), 0); + #else + float32x2_t f = vdup_n_f32(x); + float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f)); + uint32x2_t a = vclt_f32(f, t); + uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(-1.0f)); + float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b))); + return vget_lane_f32(r, 0); + #endif + } + + #ifdef STBIR_CEILF + #undef STBIR_CEILF + #endif + #define STBIR_CEILF stbir_simd_ceilf + static stbir__inline float stbir_simd_ceilf(float x) + { + #if defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) + return vget_lane_f32( vrndp_f32( vdup_n_f32(x) ), 0); + #else + float32x2_t f = vdup_n_f32(x); + float32x2_t t = vcvt_f32_s32(vcvt_s32_f32(f)); + uint32x2_t a = vclt_f32(t, f); + uint32x2_t b = vreinterpret_u32_f32(vdup_n_f32(1.0f)); + float32x2_t r = vadd_f32(t, vreinterpret_f32_u32(vand_u32(a, b))); + return vget_lane_f32(r, 0); + #endif + } + + #define STBIR_SIMD + +#elif defined(STBIR_WASM) + + #include + + #define stbir__simdf v128_t + #define stbir__simdi v128_t + + #define stbir_simdi_castf( reg ) (reg) + #define stbir_simdf_casti( reg ) (reg) + + #define stbir__simdf_load( reg, ptr ) (reg) = wasm_v128_load( (void const*)(ptr) ) + #define stbir__simdi_load( reg, ptr ) (reg) = wasm_v128_load( (void const*)(ptr) ) + #define stbir__simdf_load1( out, ptr ) (out) = wasm_v128_load32_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf) + #define stbir__simdi_load1( out, ptr ) (out) = wasm_v128_load32_splat( (void const*)(ptr) ) + #define stbir__simdf_load1z( out, ptr ) (out) = wasm_v128_load32_zero( (void const*)(ptr) ) // top values must be zero + #define stbir__simdf_frep4( fvar ) wasm_f32x4_splat( fvar ) + #define stbir__simdf_load1frep4( out, fvar ) (out) = wasm_f32x4_splat( fvar ) + #define stbir__simdf_load2( out, ptr ) (out) = wasm_v128_load64_splat( (void const*)(ptr) ) // top values can be random (not denormal or nan for perf) + #define stbir__simdf_load2z( out, ptr ) (out) = wasm_v128_load64_zero( (void const*)(ptr) ) // top values must be zero + #define stbir__simdf_load2hmerge( out, reg, ptr ) (out) = wasm_v128_load64_lane( (void const*)(ptr), reg, 1 ) + + #define stbir__simdf_zeroP() wasm_f32x4_const_splat(0) + #define stbir__simdf_zero( reg ) (reg) = wasm_f32x4_const_splat(0) + + #define stbir__simdf_store( ptr, reg ) wasm_v128_store( (void*)(ptr), reg ) + #define stbir__simdf_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 ) + #define stbir__simdf_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 ) + #define stbir__simdf_store2h( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 1 ) + + #define stbir__simdi_store( ptr, reg ) wasm_v128_store( (void*)(ptr), reg ) + #define stbir__simdi_store1( ptr, reg ) wasm_v128_store32_lane( (void*)(ptr), reg, 0 ) + #define stbir__simdi_store2( ptr, reg ) wasm_v128_store64_lane( (void*)(ptr), reg, 0 ) + + #define stbir__prefetch( ptr ) + + #define stbir__simdi_expand_u8_to_u32(out0,out1,out2,out3,ireg) \ + { \ + v128_t l = wasm_u16x8_extend_low_u8x16 ( ireg ); \ + v128_t h = wasm_u16x8_extend_high_u8x16( ireg ); \ + out0 = wasm_u32x4_extend_low_u16x8 ( l ); \ + out1 = wasm_u32x4_extend_high_u16x8( l ); \ + out2 = wasm_u32x4_extend_low_u16x8 ( h ); \ + out3 = wasm_u32x4_extend_high_u16x8( h ); \ + } + + #define stbir__simdi_expand_u8_to_1u32(out,ireg) \ + { \ + v128_t tmp = wasm_u16x8_extend_low_u8x16(ireg); \ + out = wasm_u32x4_extend_low_u16x8(tmp); \ + } + + #define stbir__simdi_expand_u16_to_u32(out0,out1,ireg) \ + { \ + out0 = wasm_u32x4_extend_low_u16x8 ( ireg ); \ + out1 = wasm_u32x4_extend_high_u16x8( ireg ); \ + } + + #define stbir__simdf_convert_float_to_i32( i, f ) (i) = wasm_i32x4_trunc_sat_f32x4(f) + #define stbir__simdf_convert_float_to_int( f ) wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(f), 0) + #define stbir__simdi_to_int( i ) wasm_i32x4_extract_lane(i, 0) + #define stbir__simdf_convert_float_to_uint8( f ) ((unsigned char)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint8_as_float),wasm_f32x4_const_splat(0))), 0)) + #define stbir__simdf_convert_float_to_short( f ) ((unsigned short)wasm_i32x4_extract_lane(wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_max(wasm_f32x4_min(f,STBIR_max_uint16_as_float),wasm_f32x4_const_splat(0))), 0)) + #define stbir__simdi_convert_i32_to_float(out, ireg) (out) = wasm_f32x4_convert_i32x4(ireg) + #define stbir__simdf_add( out, reg0, reg1 ) (out) = wasm_f32x4_add( reg0, reg1 ) + #define stbir__simdf_mult( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 ) + #define stbir__simdf_mult_mem( out, reg, ptr ) (out) = wasm_f32x4_mul( reg, wasm_v128_load( (void const*)(ptr) ) ) + #define stbir__simdf_mult1_mem( out, reg, ptr ) (out) = wasm_f32x4_mul( reg, wasm_v128_load32_splat( (void const*)(ptr) ) ) + #define stbir__simdf_add_mem( out, reg, ptr ) (out) = wasm_f32x4_add( reg, wasm_v128_load( (void const*)(ptr) ) ) + #define stbir__simdf_add1_mem( out, reg, ptr ) (out) = wasm_f32x4_add( reg, wasm_v128_load32_splat( (void const*)(ptr) ) ) + + #define stbir__simdf_madd( out, add, mul1, mul2 ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) ) + #define stbir__simdf_madd1( out, add, mul1, mul2 ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul1, mul2 ) ) + #define stbir__simdf_madd_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load( (void const*)(ptr) ) ) ) + #define stbir__simdf_madd1_mem( out, add, mul, ptr ) (out) = wasm_f32x4_add( add, wasm_f32x4_mul( mul, wasm_v128_load32_splat( (void const*)(ptr) ) ) ) + + #define stbir__simdf_add1( out, reg0, reg1 ) (out) = wasm_f32x4_add( reg0, reg1 ) + #define stbir__simdf_mult1( out, reg0, reg1 ) (out) = wasm_f32x4_mul( reg0, reg1 ) + + #define stbir__simdf_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 ) + #define stbir__simdf_or( out, reg0, reg1 ) (out) = wasm_v128_or( reg0, reg1 ) + + #define stbir__simdf_min( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 ) + #define stbir__simdf_max( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 ) + #define stbir__simdf_min1( out, reg0, reg1 ) (out) = wasm_f32x4_min( reg0, reg1 ) + #define stbir__simdf_max1( out, reg0, reg1 ) (out) = wasm_f32x4_max( reg0, reg1 ) + + #define stbir__simdf_0123ABCDto3ABx( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 3, 4, 5, -1 ) + #define stbir__simdf_0123ABCDto23Ax( out, reg0, reg1 ) (out) = wasm_i32x4_shuffle( reg0, reg1, 2, 3, 4, -1 ) + + #define stbir__simdf_aaa1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 3, 3, 3, 4) + #define stbir__simdf_1aaa(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 0, 0) + #define stbir__simdf_a1a1(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 1, 4, 3, 4) + #define stbir__simdf_1a1a(out,alp,ones) (out) = wasm_i32x4_shuffle(alp, ones, 4, 0, 4, 2) + + #define stbir__simdf_swiz( reg, one, two, three, four ) wasm_i32x4_shuffle(reg, reg, one, two, three, four) + + #define stbir__simdi_and( out, reg0, reg1 ) (out) = wasm_v128_and( reg0, reg1 ) + #define stbir__simdi_or( out, reg0, reg1 ) (out) = wasm_v128_or( reg0, reg1 ) + #define stbir__simdi_16madd( out, reg0, reg1 ) (out) = wasm_i32x4_dot_i16x8( reg0, reg1 ) + + #define stbir__simdf_pack_to_8bytes(out,aa,bb) \ + { \ + v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \ + v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint8_as_float), wasm_f32x4_const_splat(0) ); \ + v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \ + v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \ + v128_t out16 = wasm_i16x8_narrow_i32x4( ai, bi ); \ + out = wasm_u8x16_narrow_i16x8( out16, out16 ); \ + } + + #define stbir__simdf_pack_to_8words(out,aa,bb) \ + { \ + v128_t af = wasm_f32x4_max( wasm_f32x4_min(aa, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \ + v128_t bf = wasm_f32x4_max( wasm_f32x4_min(bb, STBIR_max_uint16_as_float), wasm_f32x4_const_splat(0)); \ + v128_t ai = wasm_i32x4_trunc_sat_f32x4( af ); \ + v128_t bi = wasm_i32x4_trunc_sat_f32x4( bf ); \ + out = wasm_u16x8_narrow_i32x4( ai, bi ); \ + } + + #define stbir__interleave_pack_and_store_16_u8( ptr, r0, r1, r2, r3 ) \ + { \ + v128_t tmp0 = wasm_i16x8_narrow_i32x4(r0, r1); \ + v128_t tmp1 = wasm_i16x8_narrow_i32x4(r2, r3); \ + v128_t tmp = wasm_u8x16_narrow_i16x8(tmp0, tmp1); \ + tmp = wasm_i8x16_shuffle(tmp, tmp, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); \ + wasm_v128_store( (void*)(ptr), tmp); \ + } + + #define stbir__simdf_load4_transposed( o0, o1, o2, o3, ptr ) \ + { \ + v128_t t0 = wasm_v128_load( ptr ); \ + v128_t t1 = wasm_v128_load( ptr+4 ); \ + v128_t t2 = wasm_v128_load( ptr+8 ); \ + v128_t t3 = wasm_v128_load( ptr+12 ); \ + v128_t s0 = wasm_i32x4_shuffle(t0, t1, 0, 4, 2, 6); \ + v128_t s1 = wasm_i32x4_shuffle(t0, t1, 1, 5, 3, 7); \ + v128_t s2 = wasm_i32x4_shuffle(t2, t3, 0, 4, 2, 6); \ + v128_t s3 = wasm_i32x4_shuffle(t2, t3, 1, 5, 3, 7); \ + o0 = wasm_i32x4_shuffle(s0, s2, 0, 1, 4, 5); \ + o1 = wasm_i32x4_shuffle(s1, s3, 0, 1, 4, 5); \ + o2 = wasm_i32x4_shuffle(s0, s2, 2, 3, 6, 7); \ + o3 = wasm_i32x4_shuffle(s1, s3, 2, 3, 6, 7); \ + } + + #define stbir__simdi_32shr( out, reg, imm ) out = wasm_u32x4_shr( reg, imm ) + + typedef float stbir__f32x4 __attribute__((__vector_size__(16), __aligned__(16))); + #define STBIR__SIMDF_CONST(var, x) stbir__simdf var = (v128_t)(stbir__f32x4){ x, x, x, x } + #define STBIR__SIMDI_CONST(var, x) stbir__simdi var = { x, x, x, x } + #define STBIR__CONSTF(var) (var) + #define STBIR__CONSTI(var) (var) + + #ifdef STBIR_FLOORF + #undef STBIR_FLOORF + #endif + #define STBIR_FLOORF stbir_simd_floorf + static stbir__inline float stbir_simd_floorf(float x) + { + return wasm_f32x4_extract_lane( wasm_f32x4_floor( wasm_f32x4_splat(x) ), 0); + } + + #ifdef STBIR_CEILF + #undef STBIR_CEILF + #endif + #define STBIR_CEILF stbir_simd_ceilf + static stbir__inline float stbir_simd_ceilf(float x) + { + return wasm_f32x4_extract_lane( wasm_f32x4_ceil( wasm_f32x4_splat(x) ), 0); + } + + #define STBIR_SIMD + +#endif // SSE2/NEON/WASM + +#endif // NO SIMD + +#ifdef STBIR_SIMD8 + #define stbir__simdfX stbir__simdf8 + #define stbir__simdiX stbir__simdi8 + #define stbir__simdfX_load stbir__simdf8_load + #define stbir__simdiX_load stbir__simdi8_load + #define stbir__simdfX_mult stbir__simdf8_mult + #define stbir__simdfX_add_mem stbir__simdf8_add_mem + #define stbir__simdfX_madd_mem stbir__simdf8_madd_mem + #define stbir__simdfX_store stbir__simdf8_store + #define stbir__simdiX_store stbir__simdi8_store + #define stbir__simdf_frepX stbir__simdf8_frep8 + #define stbir__simdfX_madd stbir__simdf8_madd + #define stbir__simdfX_min stbir__simdf8_min + #define stbir__simdfX_max stbir__simdf8_max + #define stbir__simdfX_aaa1 stbir__simdf8_aaa1 + #define stbir__simdfX_1aaa stbir__simdf8_1aaa + #define stbir__simdfX_a1a1 stbir__simdf8_a1a1 + #define stbir__simdfX_1a1a stbir__simdf8_1a1a + #define stbir__simdfX_convert_float_to_i32 stbir__simdf8_convert_float_to_i32 + #define stbir__simdfX_pack_to_words stbir__simdf8_pack_to_16words + #define stbir__simdfX_zero stbir__simdf8_zero + #define STBIR_onesX STBIR_ones8 + #define STBIR_max_uint8_as_floatX STBIR_max_uint8_as_float8 + #define STBIR_max_uint16_as_floatX STBIR_max_uint16_as_float8 + #define STBIR_simd_point5X STBIR_simd_point58 + #define stbir__simdfX_float_count 8 + #define stbir__simdfX_0123to1230 stbir__simdf8_0123to12301230 + #define stbir__simdfX_0123to2103 stbir__simdf8_0123to21032103 + static const stbir__simdf8 STBIR_max_uint16_as_float_inverted8 = { stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted,stbir__max_uint16_as_float_inverted }; + static const stbir__simdf8 STBIR_max_uint8_as_float_inverted8 = { stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted,stbir__max_uint8_as_float_inverted }; + static const stbir__simdf8 STBIR_ones8 = { 1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0 }; + static const stbir__simdf8 STBIR_simd_point58 = { 0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5 }; + static const stbir__simdf8 STBIR_max_uint8_as_float8 = { stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float, stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float,stbir__max_uint8_as_float }; + static const stbir__simdf8 STBIR_max_uint16_as_float8 = { stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float, stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float,stbir__max_uint16_as_float }; +#else + #define stbir__simdfX stbir__simdf + #define stbir__simdiX stbir__simdi + #define stbir__simdfX_load stbir__simdf_load + #define stbir__simdiX_load stbir__simdi_load + #define stbir__simdfX_mult stbir__simdf_mult + #define stbir__simdfX_add_mem stbir__simdf_add_mem + #define stbir__simdfX_madd_mem stbir__simdf_madd_mem + #define stbir__simdfX_store stbir__simdf_store + #define stbir__simdiX_store stbir__simdi_store + #define stbir__simdf_frepX stbir__simdf_frep4 + #define stbir__simdfX_madd stbir__simdf_madd + #define stbir__simdfX_min stbir__simdf_min + #define stbir__simdfX_max stbir__simdf_max + #define stbir__simdfX_aaa1 stbir__simdf_aaa1 + #define stbir__simdfX_1aaa stbir__simdf_1aaa + #define stbir__simdfX_a1a1 stbir__simdf_a1a1 + #define stbir__simdfX_1a1a stbir__simdf_1a1a + #define stbir__simdfX_convert_float_to_i32 stbir__simdf_convert_float_to_i32 + #define stbir__simdfX_pack_to_words stbir__simdf_pack_to_8words + #define stbir__simdfX_zero stbir__simdf_zero + #define STBIR_onesX STBIR__CONSTF(STBIR_ones) + #define STBIR_simd_point5X STBIR__CONSTF(STBIR_simd_point5) + #define STBIR_max_uint8_as_floatX STBIR__CONSTF(STBIR_max_uint8_as_float) + #define STBIR_max_uint16_as_floatX STBIR__CONSTF(STBIR_max_uint16_as_float) + #define stbir__simdfX_float_count 4 + #define stbir__if_simdf8_cast_to_simdf4( val ) ( val ) + #define stbir__simdfX_0123to1230 stbir__simdf_0123to1230 + #define stbir__simdfX_0123to2103 stbir__simdf_0123to2103 +#endif + + +#if defined(STBIR_NEON) && !defined(_M_ARM) && !defined(__arm__) + + #if defined( _MSC_VER ) && !defined(__clang__) + typedef __int16 stbir__FP16; + #else + typedef float16_t stbir__FP16; + #endif + +#else // no NEON, or 32-bit ARM for MSVC + + typedef union stbir__FP16 + { + unsigned short u; + } stbir__FP16; + +#endif + +#if (!defined(STBIR_NEON) && !defined(STBIR_FP16C)) || (defined(STBIR_NEON) && defined(_M_ARM)) || (defined(STBIR_NEON) && defined(__arm__)) + + // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668 + + static stbir__inline float stbir__half_to_float( stbir__FP16 h ) + { + static const stbir__FP32 magic = { (254 - 15) << 23 }; + static const stbir__FP32 was_infnan = { (127 + 16) << 23 }; + stbir__FP32 o; + + o.u = (h.u & 0x7fff) << 13; // exponent/mantissa bits + o.f *= magic.f; // exponent adjust + if (o.f >= was_infnan.f) // make sure Inf/NaN survive + o.u |= 255 << 23; + o.u |= (h.u & 0x8000) << 16; // sign bit + return o.f; + } + + static stbir__inline stbir__FP16 stbir__float_to_half(float val) + { + stbir__FP32 f32infty = { 255 << 23 }; + stbir__FP32 f16max = { (127 + 16) << 23 }; + stbir__FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + unsigned int sign_mask = 0x80000000u; + stbir__FP16 o = { 0 }; + stbir__FP32 f; + unsigned int sign; + + f.f = val; + sign = f.u & sign_mask; + f.u ^= sign; + + if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set) + o.u = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + else // (De)normalized number or zero + { + if (f.u < (113 << 23)) // resulting FP16 is subnormal or zero + { + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + // and one integer subtract of the bias later, we have our final float! + o.u = (unsigned short) ( f.u - denorm_magic.u ); + } + else + { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + // update exponent, rounding bias part 1 + f.u = f.u + ((15u - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + o.u = (unsigned short) ( f.u >> 13 ); + } + } + + o.u |= sign >> 16; + return o; + } + +#endif + + +#if defined(STBIR_FP16C) + + #include + + static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) + { + _mm256_storeu_ps( (float*)output, _mm256_cvtph_ps( _mm_loadu_si128( (__m128i const* )input ) ) ); + } + + static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) + { + _mm_storeu_si128( (__m128i*)output, _mm256_cvtps_ph( _mm256_loadu_ps( input ), 0 ) ); + } + + static stbir__inline float stbir__half_to_float( stbir__FP16 h ) + { + return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( (int)h.u ) ) ); + } + + static stbir__inline stbir__FP16 stbir__float_to_half( float f ) + { + stbir__FP16 h; + h.u = (unsigned short) _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( f ), 0 ) ); + return h; + } + +#elif defined(STBIR_SSE2) + + // Fabian's half float routines, see: https://gist.github.com/rygorous/2156668 + stbir__inline static void stbir__half_to_float_SIMD(float * output, void const * input) + { + static const STBIR__SIMDI_CONST(mask_nosign, 0x7fff); + static const STBIR__SIMDI_CONST(smallest_normal, 0x0400); + static const STBIR__SIMDI_CONST(infinity, 0x7c00); + static const STBIR__SIMDI_CONST(expadjust_normal, (127 - 15) << 23); + static const STBIR__SIMDI_CONST(magic_denorm, 113 << 23); + + __m128i i = _mm_loadu_si128 ( (__m128i const*)(input) ); + __m128i h = _mm_unpacklo_epi16 ( i, _mm_setzero_si128() ); + __m128i mnosign = STBIR__CONSTI(mask_nosign); + __m128i eadjust = STBIR__CONSTI(expadjust_normal); + __m128i smallest = STBIR__CONSTI(smallest_normal); + __m128i infty = STBIR__CONSTI(infinity); + __m128i expmant = _mm_and_si128(mnosign, h); + __m128i justsign = _mm_xor_si128(h, expmant); + __m128i b_notinfnan = _mm_cmpgt_epi32(infty, expmant); + __m128i b_isdenorm = _mm_cmpgt_epi32(smallest, expmant); + __m128i shifted = _mm_slli_epi32(expmant, 13); + __m128i adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust); + __m128i adjusted = _mm_add_epi32(eadjust, shifted); + __m128i den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm)); + __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan); + __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm); + __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm)); + __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2)); + __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4); + __m128i sign = _mm_slli_epi32(justsign, 16); + __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign)); + stbir__simdf_store( output + 0, final ); + + h = _mm_unpackhi_epi16 ( i, _mm_setzero_si128() ); + expmant = _mm_and_si128(mnosign, h); + justsign = _mm_xor_si128(h, expmant); + b_notinfnan = _mm_cmpgt_epi32(infty, expmant); + b_isdenorm = _mm_cmpgt_epi32(smallest, expmant); + shifted = _mm_slli_epi32(expmant, 13); + adj_infnan = _mm_andnot_si128(b_notinfnan, eadjust); + adjusted = _mm_add_epi32(eadjust, shifted); + den1 = _mm_add_epi32(shifted, STBIR__CONSTI(magic_denorm)); + adjusted2 = _mm_add_epi32(adjusted, adj_infnan); + den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm); + adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm)); + adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2)); + adjusted5 = _mm_or_ps(adjusted3, adjusted4); + sign = _mm_slli_epi32(justsign, 16); + final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign)); + stbir__simdf_store( output + 4, final ); + + // ~38 SSE2 ops for 8 values + } + + // Fabian's round-to-nearest-even float to half + // ~48 SSE2 ops for 8 output + stbir__inline static void stbir__float_to_half_SIMD(void * output, float const * input) + { + static const STBIR__SIMDI_CONST(mask_sign, 0x80000000u); + static const STBIR__SIMDI_CONST(c_f16max, (127 + 16) << 23); // all FP32 values >=this round to +inf + static const STBIR__SIMDI_CONST(c_nanbit, 0x200); + static const STBIR__SIMDI_CONST(c_infty_as_fp16, 0x7c00); + static const STBIR__SIMDI_CONST(c_min_normal, (127 - 14) << 23); // smallest FP32 that yields a normalized FP16 + static const STBIR__SIMDI_CONST(c_subnorm_magic, ((127 - 15) + (23 - 10) + 1) << 23); + static const STBIR__SIMDI_CONST(c_normal_bias, 0xfff - ((127 - 15) << 23)); // adjust exponent and add mantissa rounding + + __m128 f = _mm_loadu_ps(input); + __m128 msign = _mm_castsi128_ps(STBIR__CONSTI(mask_sign)); + __m128 justsign = _mm_and_ps(msign, f); + __m128 absf = _mm_xor_ps(f, justsign); + __m128i absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit) + __m128i f16max = STBIR__CONSTI(c_f16max); + __m128 b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN? + __m128i b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special? + __m128i nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), STBIR__CONSTI(c_nanbit)); + __m128i inf_or_nan = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials + + __m128i min_normal = STBIR__CONSTI(c_min_normal); + __m128i b_issub = _mm_cmpgt_epi32(min_normal, absf_int); + + // "result is subnormal" path + __m128 subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa + __m128i subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias + + // "result is normal" path + __m128i mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign + __m128i mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0 + + __m128i round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias)); + __m128i round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE) + __m128i normal = _mm_srli_epi32(round2, 13); // rounded result + + // combine the two non-specials + __m128i nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal)); + + // merge in specials as well + __m128i joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan)); + + __m128i sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16); + __m128i final2, final= _mm_or_si128(joined, sign_shift); + + f = _mm_loadu_ps(input+4); + justsign = _mm_and_ps(msign, f); + absf = _mm_xor_ps(f, justsign); + absf_int = _mm_castps_si128(absf); // the cast is "free" (extra bypass latency, but no thruput hit) + b_isnan = _mm_cmpunord_ps(absf, absf); // is this a NaN? + b_isregular = _mm_cmpgt_epi32(f16max, absf_int); // (sub)normalized or special? + nanbit = _mm_and_si128(_mm_castps_si128(b_isnan), c_nanbit); + inf_or_nan = _mm_or_si128(nanbit, STBIR__CONSTI(c_infty_as_fp16)); // output for specials + + b_issub = _mm_cmpgt_epi32(min_normal, absf_int); + + // "result is subnormal" path + subnorm1 = _mm_add_ps(absf, _mm_castsi128_ps(STBIR__CONSTI(c_subnorm_magic))); // magic value to round output mantissa + subnorm2 = _mm_sub_epi32(_mm_castps_si128(subnorm1), STBIR__CONSTI(c_subnorm_magic)); // subtract out bias + + // "result is normal" path + mantoddbit = _mm_slli_epi32(absf_int, 31 - 13); // shift bit 13 (mantissa LSB) to sign + mantodd = _mm_srai_epi32(mantoddbit, 31); // -1 if FP16 mantissa odd, else 0 + + round1 = _mm_add_epi32(absf_int, STBIR__CONSTI(c_normal_bias)); + round2 = _mm_sub_epi32(round1, mantodd); // if mantissa LSB odd, bias towards rounding up (RTNE) + normal = _mm_srli_epi32(round2, 13); // rounded result + + // combine the two non-specials + nonspecial = _mm_or_si128(_mm_and_si128(subnorm2, b_issub), _mm_andnot_si128(b_issub, normal)); + + // merge in specials as well + joined = _mm_or_si128(_mm_and_si128(nonspecial, b_isregular), _mm_andnot_si128(b_isregular, inf_or_nan)); + + sign_shift = _mm_srai_epi32(_mm_castps_si128(justsign), 16); + final2 = _mm_or_si128(joined, sign_shift); + final = _mm_packs_epi32(final, final2); + stbir__simdi_store( output,final ); + } + +#elif defined(STBIR_NEON) && defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // 64-bit ARM on MSVC (not clang) + + static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) + { + float16x4_t in0 = vld1_f16(input + 0); + float16x4_t in1 = vld1_f16(input + 4); + vst1q_f32(output + 0, vcvt_f32_f16(in0)); + vst1q_f32(output + 4, vcvt_f32_f16(in1)); + } + + static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) + { + float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0)); + float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4)); + vst1_f16(output+0, out0); + vst1_f16(output+4, out1); + } + + static stbir__inline float stbir__half_to_float( stbir__FP16 h ) + { + return vgetq_lane_f32(vcvt_f32_f16(vld1_dup_f16(&h)), 0); + } + + static stbir__inline stbir__FP16 stbir__float_to_half( float f ) + { + return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0).n16_u16[0]; + } + +#elif defined(STBIR_NEON) && ( defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) ) // 64-bit ARM + + static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) + { + float16x8_t in = vld1q_f16(input); + vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(in))); + vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(in))); + } + + static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) + { + float16x4_t out0 = vcvt_f16_f32(vld1q_f32(input + 0)); + float16x4_t out1 = vcvt_f16_f32(vld1q_f32(input + 4)); + vst1q_f16(output, vcombine_f16(out0, out1)); + } + + static stbir__inline float stbir__half_to_float( stbir__FP16 h ) + { + return vgetq_lane_f32(vcvt_f32_f16(vdup_n_f16(h)), 0); + } + + static stbir__inline stbir__FP16 stbir__float_to_half( float f ) + { + return vget_lane_f16(vcvt_f16_f32(vdupq_n_f32(f)), 0); + } + +#elif defined(STBIR_WASM) || (defined(STBIR_NEON) && (defined(_MSC_VER) || defined(_M_ARM) || defined(__arm__))) // WASM or 32-bit ARM on MSVC/clang + + static stbir__inline void stbir__half_to_float_SIMD(float * output, stbir__FP16 const * input) + { + for (int i=0; i<8; i++) + { + output[i] = stbir__half_to_float(input[i]); + } + } + static stbir__inline void stbir__float_to_half_SIMD(stbir__FP16 * output, float const * input) + { + for (int i=0; i<8; i++) + { + output[i] = stbir__float_to_half(input[i]); + } + } + +#endif + + +#ifdef STBIR_SIMD + +#define stbir__simdf_0123to3333( out, reg ) (out) = stbir__simdf_swiz( reg, 3,3,3,3 ) +#define stbir__simdf_0123to2222( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,2,2 ) +#define stbir__simdf_0123to1111( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,1,1 ) +#define stbir__simdf_0123to0000( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,0 ) +#define stbir__simdf_0123to0003( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,3 ) +#define stbir__simdf_0123to0001( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,0,1 ) +#define stbir__simdf_0123to1122( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,2,2 ) +#define stbir__simdf_0123to2333( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,3,3 ) +#define stbir__simdf_0123to0023( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,3 ) +#define stbir__simdf_0123to1230( out, reg ) (out) = stbir__simdf_swiz( reg, 1,2,3,0 ) +#define stbir__simdf_0123to2103( out, reg ) (out) = stbir__simdf_swiz( reg, 2,1,0,3 ) +#define stbir__simdf_0123to3210( out, reg ) (out) = stbir__simdf_swiz( reg, 3,2,1,0 ) +#define stbir__simdf_0123to2301( out, reg ) (out) = stbir__simdf_swiz( reg, 2,3,0,1 ) +#define stbir__simdf_0123to3012( out, reg ) (out) = stbir__simdf_swiz( reg, 3,0,1,2 ) +#define stbir__simdf_0123to0011( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,1,1 ) +#define stbir__simdf_0123to1100( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,0,0 ) +#define stbir__simdf_0123to2233( out, reg ) (out) = stbir__simdf_swiz( reg, 2,2,3,3 ) +#define stbir__simdf_0123to1133( out, reg ) (out) = stbir__simdf_swiz( reg, 1,1,3,3 ) +#define stbir__simdf_0123to0022( out, reg ) (out) = stbir__simdf_swiz( reg, 0,0,2,2 ) +#define stbir__simdf_0123to1032( out, reg ) (out) = stbir__simdf_swiz( reg, 1,0,3,2 ) + +typedef union stbir__simdi_u32 +{ + stbir_uint32 m128i_u32[4]; + int m128i_i32[4]; + stbir__simdi m128i_i128; +} stbir__simdi_u32; + +static const int STBIR_mask[9] = { 0,0,0,-1,-1,-1,0,0,0 }; + +static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float, stbir__max_uint8_as_float); +static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float, stbir__max_uint16_as_float); +static const STBIR__SIMDF_CONST(STBIR_max_uint8_as_float_inverted, stbir__max_uint8_as_float_inverted); +static const STBIR__SIMDF_CONST(STBIR_max_uint16_as_float_inverted, stbir__max_uint16_as_float_inverted); + +static const STBIR__SIMDF_CONST(STBIR_simd_point5, 0.5f); +static const STBIR__SIMDF_CONST(STBIR_ones, 1.0f); +static const STBIR__SIMDI_CONST(STBIR_almost_zero, (127 - 13) << 23); +static const STBIR__SIMDI_CONST(STBIR_almost_one, 0x3f7fffff); +static const STBIR__SIMDI_CONST(STBIR_mastissa_mask, 0xff); +static const STBIR__SIMDI_CONST(STBIR_topscale, 0x02000000); + +// Basically, in simd mode, we unroll the proper amount, and we don't want +// the non-simd remnant loops to be unroll because they only run a few times +// Adding this switch saves about 5K on clang which is Captain Unroll the 3rd. +#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star ) +#define STBIR_SIMD_NO_UNROLL(ptr) STBIR_NO_UNROLL(ptr) +#define STBIR_SIMD_NO_UNROLL_LOOP_START STBIR_NO_UNROLL_LOOP_START +#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR STBIR_NO_UNROLL_LOOP_START_INF_FOR + +#ifdef STBIR_MEMCPY +#undef STBIR_MEMCPY +#endif +#define STBIR_MEMCPY stbir_simd_memcpy + +// override normal use of memcpy with much simpler copy (faster and smaller with our sized copies) +static void stbir_simd_memcpy( void * dest, void const * src, size_t bytes ) +{ + char STBIR_SIMD_STREAMOUT_PTR (*) d = (char*) dest; + char STBIR_SIMD_STREAMOUT_PTR( * ) d_end = ((char*) dest) + bytes; + ptrdiff_t ofs_to_src = (char*)src - (char*)dest; + + // check overlaps + STBIR_ASSERT( ( ( d >= ( (char*)src) + bytes ) ) || ( ( d + bytes ) <= (char*)src ) ); + + if ( bytes < (16*stbir__simdfX_float_count) ) + { + if ( bytes < 16 ) + { + if ( bytes ) + { + STBIR_SIMD_NO_UNROLL_LOOP_START + do + { + STBIR_SIMD_NO_UNROLL(d); + d[ 0 ] = d[ ofs_to_src ]; + ++d; + } while ( d < d_end ); + } + } + else + { + stbir__simdf x; + // do one unaligned to get us aligned for the stream out below + stbir__simdf_load( x, ( d + ofs_to_src ) ); + stbir__simdf_store( d, x ); + d = (char*)( ( ( (size_t)d ) + 16 ) & ~15 ); + + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + STBIR_SIMD_NO_UNROLL(d); + + if ( d > ( d_end - 16 ) ) + { + if ( d == d_end ) + return; + d = d_end - 16; + } + + stbir__simdf_load( x, ( d + ofs_to_src ) ); + stbir__simdf_store( d, x ); + d += 16; + } + } + } + else + { + stbir__simdfX x0,x1,x2,x3; + + // do one unaligned to get us aligned for the stream out below + stbir__simdfX_load( x0, ( d + ofs_to_src ) + 0*stbir__simdfX_float_count ); + stbir__simdfX_load( x1, ( d + ofs_to_src ) + 4*stbir__simdfX_float_count ); + stbir__simdfX_load( x2, ( d + ofs_to_src ) + 8*stbir__simdfX_float_count ); + stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count ); + stbir__simdfX_store( d + 0*stbir__simdfX_float_count, x0 ); + stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 ); + stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 ); + stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 ); + d = (char*)( ( ( (size_t)d ) + (16*stbir__simdfX_float_count) ) & ~((16*stbir__simdfX_float_count)-1) ); + + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + STBIR_SIMD_NO_UNROLL(d); + + if ( d > ( d_end - (16*stbir__simdfX_float_count) ) ) + { + if ( d == d_end ) + return; + d = d_end - (16*stbir__simdfX_float_count); + } + + stbir__simdfX_load( x0, ( d + ofs_to_src ) + 0*stbir__simdfX_float_count ); + stbir__simdfX_load( x1, ( d + ofs_to_src ) + 4*stbir__simdfX_float_count ); + stbir__simdfX_load( x2, ( d + ofs_to_src ) + 8*stbir__simdfX_float_count ); + stbir__simdfX_load( x3, ( d + ofs_to_src ) + 12*stbir__simdfX_float_count ); + stbir__simdfX_store( d + 0*stbir__simdfX_float_count, x0 ); + stbir__simdfX_store( d + 4*stbir__simdfX_float_count, x1 ); + stbir__simdfX_store( d + 8*stbir__simdfX_float_count, x2 ); + stbir__simdfX_store( d + 12*stbir__simdfX_float_count, x3 ); + d += (16*stbir__simdfX_float_count); + } + } +} + +// memcpy that is specically intentionally overlapping (src is smaller then dest, so can be +// a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to +// the diff between dest and src) +static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) +{ + char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src; + char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes; + ptrdiff_t ofs_to_dest = (char*)dest - (char*)src; + + if ( ofs_to_dest >= 16 ) // is the overlap more than 16 away? + { + char STBIR_SIMD_STREAMOUT_PTR( * ) s_end16 = ((char*) src) + (bytes&~15); + STBIR_SIMD_NO_UNROLL_LOOP_START + do + { + stbir__simdf x; + STBIR_SIMD_NO_UNROLL(sd); + stbir__simdf_load( x, sd ); + stbir__simdf_store( ( sd + ofs_to_dest ), x ); + sd += 16; + } while ( sd < s_end16 ); + + if ( sd == s_end ) + return; + } + + do + { + STBIR_SIMD_NO_UNROLL(sd); + *(int*)( sd + ofs_to_dest ) = *(int*) sd; + sd += 4; + } while ( sd < s_end ); +} + +#else // no SSE2 + +// when in scalar mode, we let unrolling happen, so this macro just does the __restrict +#define STBIR_SIMD_STREAMOUT_PTR( star ) STBIR_STREAMOUT_PTR( star ) +#define STBIR_SIMD_NO_UNROLL(ptr) +#define STBIR_SIMD_NO_UNROLL_LOOP_START +#define STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + +#endif // SSE2 + + +#ifdef STBIR_PROFILE + +#ifndef STBIR_PROFILE_FUNC + +#if defined(_x86_64) || defined( __x86_64__ ) || defined( _M_X64 ) || defined(__x86_64) || defined(__SSE2__) || defined(STBIR_SSE) || defined( _M_IX86_FP ) || defined(__i386) || defined( __i386__ ) || defined( _M_IX86 ) || defined( _X86_ ) + +#ifdef _MSC_VER + + STBIRDEF stbir_uint64 __rdtsc(); + #define STBIR_PROFILE_FUNC() __rdtsc() + +#else // non msvc + + static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC() + { + stbir_uint32 lo, hi; + asm volatile ("rdtsc" : "=a" (lo), "=d" (hi) ); + return ( ( (stbir_uint64) hi ) << 32 ) | ( (stbir_uint64) lo ); + } + +#endif // msvc + +#elif defined( _M_ARM64 ) || defined( __aarch64__ ) || defined( __arm64__ ) || defined(__ARM_NEON__) + +#if defined( _MSC_VER ) && !defined(__clang__) + + #define STBIR_PROFILE_FUNC() _ReadStatusReg(ARM64_CNTVCT) + +#else + + static stbir__inline stbir_uint64 STBIR_PROFILE_FUNC() + { + stbir_uint64 tsc; + asm volatile("mrs %0, cntvct_el0" : "=r" (tsc)); + return tsc; + } + +#endif + +#else // x64, arm + +#error Unknown platform for profiling. + +#endif // x64, arm + +#endif // STBIR_PROFILE_FUNC + +#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO ,stbir__per_split_info * split_info +#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO ,split_info + +#define STBIR_ONLY_PROFILE_BUILD_GET_INFO ,stbir__info * profile_info +#define STBIR_ONLY_PROFILE_BUILD_SET_INFO ,profile_info + +// super light-weight micro profiler +#define STBIR_PROFILE_START_ll( info, wh ) { stbir_uint64 wh##thiszonetime = STBIR_PROFILE_FUNC(); stbir_uint64 * wh##save_parent_excluded_ptr = info->current_zone_excluded_ptr; stbir_uint64 wh##current_zone_excluded = 0; info->current_zone_excluded_ptr = &wh##current_zone_excluded; +#define STBIR_PROFILE_END_ll( info, wh ) wh##thiszonetime = STBIR_PROFILE_FUNC() - wh##thiszonetime; info->profile.named.wh += wh##thiszonetime - wh##current_zone_excluded; *wh##save_parent_excluded_ptr += wh##thiszonetime; info->current_zone_excluded_ptr = wh##save_parent_excluded_ptr; } +#define STBIR_PROFILE_FIRST_START_ll( info, wh ) { int i; info->current_zone_excluded_ptr = &info->profile.named.total; for(i=0;iprofile.array);i++) info->profile.array[i]=0; } STBIR_PROFILE_START_ll( info, wh ); +#define STBIR_PROFILE_CLEAR_EXTRAS_ll( info, num ) { int extra; for(extra=1;extra<(num);extra++) { int i; for(i=0;iprofile.array);i++) (info)[extra].profile.array[i]=0; } } + +// for thread data +#define STBIR_PROFILE_START( wh ) STBIR_PROFILE_START_ll( split_info, wh ) +#define STBIR_PROFILE_END( wh ) STBIR_PROFILE_END_ll( split_info, wh ) +#define STBIR_PROFILE_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( split_info, wh ) +#define STBIR_PROFILE_CLEAR_EXTRAS() STBIR_PROFILE_CLEAR_EXTRAS_ll( split_info, split_count ) + +// for build data +#define STBIR_PROFILE_BUILD_START( wh ) STBIR_PROFILE_START_ll( profile_info, wh ) +#define STBIR_PROFILE_BUILD_END( wh ) STBIR_PROFILE_END_ll( profile_info, wh ) +#define STBIR_PROFILE_BUILD_FIRST_START( wh ) STBIR_PROFILE_FIRST_START_ll( profile_info, wh ) +#define STBIR_PROFILE_BUILD_CLEAR( info ) { int i; for(i=0;iprofile.array);i++) info->profile.array[i]=0; } + +#else // no profile + +#define STBIR_ONLY_PROFILE_GET_SPLIT_INFO +#define STBIR_ONLY_PROFILE_SET_SPLIT_INFO + +#define STBIR_ONLY_PROFILE_BUILD_GET_INFO +#define STBIR_ONLY_PROFILE_BUILD_SET_INFO + +#define STBIR_PROFILE_START( wh ) +#define STBIR_PROFILE_END( wh ) +#define STBIR_PROFILE_FIRST_START( wh ) +#define STBIR_PROFILE_CLEAR_EXTRAS( ) + +#define STBIR_PROFILE_BUILD_START( wh ) +#define STBIR_PROFILE_BUILD_END( wh ) +#define STBIR_PROFILE_BUILD_FIRST_START( wh ) +#define STBIR_PROFILE_BUILD_CLEAR( info ) + +#endif // stbir_profile + +#ifndef STBIR_CEILF +#include +#if _MSC_VER <= 1200 // support VC6 for Sean +#define STBIR_CEILF(x) ((float)ceil((float)(x))) +#define STBIR_FLOORF(x) ((float)floor((float)(x))) +#else +#define STBIR_CEILF(x) ceilf(x) +#define STBIR_FLOORF(x) floorf(x) +#endif +#endif + +#ifndef STBIR_MEMCPY +// For memcpy +#include +#define STBIR_MEMCPY( dest, src, len ) memcpy( dest, src, len ) +#endif + +#ifndef STBIR_SIMD + +// memcpy that is specifically intentionally overlapping (src is smaller then dest, so can be +// a normal forward copy, bytes is divisible by 4 and bytes is greater than or equal to +// the diff between dest and src) +static void stbir_overlapping_memcpy( void * dest, void const * src, size_t bytes ) +{ + char STBIR_SIMD_STREAMOUT_PTR (*) sd = (char*) src; + char STBIR_SIMD_STREAMOUT_PTR( * ) s_end = ((char*) src) + bytes; + ptrdiff_t ofs_to_dest = (char*)dest - (char*)src; + + if ( ofs_to_dest >= 8 ) // is the overlap more than 8 away? + { + char STBIR_SIMD_STREAMOUT_PTR( * ) s_end8 = ((char*) src) + (bytes&~7); + STBIR_NO_UNROLL_LOOP_START + do + { + STBIR_NO_UNROLL(sd); + *(stbir_uint64*)( sd + ofs_to_dest ) = *(stbir_uint64*) sd; + sd += 8; + } while ( sd < s_end8 ); + + if ( sd == s_end ) + return; + } + + STBIR_NO_UNROLL_LOOP_START + do + { + STBIR_NO_UNROLL(sd); + *(int*)( sd + ofs_to_dest ) = *(int*) sd; + sd += 4; + } while ( sd < s_end ); +} + +#endif + +static float stbir__filter_trapezoid(float x, float scale, void * user_data) +{ + float halfscale = scale / 2; + float t = 0.5f + halfscale; + STBIR_ASSERT(scale <= 1); + STBIR__UNUSED(user_data); + + if ( x < 0.0f ) x = -x; + + if (x >= t) + return 0.0f; + else + { + float r = 0.5f - halfscale; + if (x <= r) + return 1.0f; + else + return (t - x) / scale; + } +} + +static float stbir__support_trapezoid(float scale, void * user_data) +{ + STBIR__UNUSED(user_data); + return 0.5f + scale / 2.0f; +} + +static float stbir__filter_triangle(float x, float s, void * user_data) +{ + STBIR__UNUSED(s); + STBIR__UNUSED(user_data); + + if ( x < 0.0f ) x = -x; + + if (x <= 1.0f) + return 1.0f - x; + else + return 0.0f; +} + +static float stbir__filter_point(float x, float s, void * user_data) +{ + STBIR__UNUSED(x); + STBIR__UNUSED(s); + STBIR__UNUSED(user_data); + + return 1.0f; +} + +static float stbir__filter_cubic(float x, float s, void * user_data) +{ + STBIR__UNUSED(s); + STBIR__UNUSED(user_data); + + if ( x < 0.0f ) x = -x; + + if (x < 1.0f) + return (4.0f + x*x*(3.0f*x - 6.0f))/6.0f; + else if (x < 2.0f) + return (8.0f + x*(-12.0f + x*(6.0f - x)))/6.0f; + + return (0.0f); +} + +static float stbir__filter_catmullrom(float x, float s, void * user_data) +{ + STBIR__UNUSED(s); + STBIR__UNUSED(user_data); + + if ( x < 0.0f ) x = -x; + + if (x < 1.0f) + return 1.0f - x*x*(2.5f - 1.5f*x); + else if (x < 2.0f) + return 2.0f - x*(4.0f + x*(0.5f*x - 2.5f)); + + return (0.0f); +} + +static float stbir__filter_mitchell(float x, float s, void * user_data) +{ + STBIR__UNUSED(s); + STBIR__UNUSED(user_data); + + if ( x < 0.0f ) x = -x; + + if (x < 1.0f) + return (16.0f + x*x*(21.0f * x - 36.0f))/18.0f; + else if (x < 2.0f) + return (32.0f + x*(-60.0f + x*(36.0f - 7.0f*x)))/18.0f; + + return (0.0f); +} + +static float stbir__support_zeropoint5(float s, void * user_data) +{ + STBIR__UNUSED(s); + STBIR__UNUSED(user_data); + return 0.5f; +} + +static float stbir__support_one(float s, void * user_data) +{ + STBIR__UNUSED(s); + STBIR__UNUSED(user_data); + return 1; +} + +static float stbir__support_two(float s, void * user_data) +{ + STBIR__UNUSED(s); + STBIR__UNUSED(user_data); + return 2; +} + +// This is the maximum number of input samples that can affect an output sample +// with the given filter from the output pixel's perspective +static int stbir__get_filter_pixel_width(stbir__support_callback * support, float scale, void * user_data) +{ + STBIR_ASSERT(support != 0); + + if ( scale >= ( 1.0f-stbir__small_float ) ) // upscale + return (int)STBIR_CEILF(support(1.0f/scale,user_data) * 2.0f); + else + return (int)STBIR_CEILF(support(scale,user_data) * 2.0f / scale); +} + +// this is how many coefficents per run of the filter (which is different +// from the filter_pixel_width depending on if we are scattering or gathering) +static int stbir__get_coefficient_width(stbir__sampler * samp, int is_gather, void * user_data) +{ + float scale = samp->scale_info.scale; + stbir__support_callback * support = samp->filter_support; + + switch( is_gather ) + { + case 1: + return (int)STBIR_CEILF(support(1.0f / scale, user_data) * 2.0f); + case 2: + return (int)STBIR_CEILF(support(scale, user_data) * 2.0f / scale); + case 0: + return (int)STBIR_CEILF(support(scale, user_data) * 2.0f); + default: + STBIR_ASSERT( (is_gather >= 0 ) && (is_gather <= 2 ) ); + return 0; + } +} + +static int stbir__get_contributors(stbir__sampler * samp, int is_gather) +{ + if (is_gather) + return samp->scale_info.output_sub_size; + else + return (samp->scale_info.input_full_size + samp->filter_pixel_margin * 2); +} + +static int stbir__edge_zero_full( int n, int max ) +{ + STBIR__UNUSED(n); + STBIR__UNUSED(max); + return 0; // NOTREACHED +} + +static int stbir__edge_clamp_full( int n, int max ) +{ + if (n < 0) + return 0; + + if (n >= max) + return max - 1; + + return n; // NOTREACHED +} + +static int stbir__edge_reflect_full( int n, int max ) +{ + if (n < 0) + { + if (n > -max) + return -n; + else + return max - 1; + } + + if (n >= max) + { + int max2 = max * 2; + if (n >= max2) + return 0; + else + return max2 - n - 1; + } + + return n; // NOTREACHED +} + +static int stbir__edge_wrap_full( int n, int max ) +{ + if (n >= 0) + return (n % max); + else + { + int m = (-n) % max; + + if (m != 0) + m = max - m; + + return (m); + } +} + +typedef int stbir__edge_wrap_func( int n, int max ); +static stbir__edge_wrap_func * stbir__edge_wrap_slow[] = +{ + stbir__edge_clamp_full, // STBIR_EDGE_CLAMP + stbir__edge_reflect_full, // STBIR_EDGE_REFLECT + stbir__edge_wrap_full, // STBIR_EDGE_WRAP + stbir__edge_zero_full, // STBIR_EDGE_ZERO +}; + +stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max) +{ + // avoid per-pixel switch + if (n >= 0 && n < max) + return n; + return stbir__edge_wrap_slow[edge]( n, max ); +} + +#define STBIR__MERGE_RUNS_PIXEL_THRESHOLD 16 + +// get information on the extents of a sampler +static void stbir__get_extents( stbir__sampler * samp, stbir__extents * scanline_extents ) +{ + int j, stop; + int left_margin, right_margin; + int min_n = 0x7fffffff, max_n = -0x7fffffff; + int min_left = 0x7fffffff, max_left = -0x7fffffff; + int min_right = 0x7fffffff, max_right = -0x7fffffff; + stbir_edge edge = samp->edge; + stbir__contributors* contributors = samp->contributors; + int output_sub_size = samp->scale_info.output_sub_size; + int input_full_size = samp->scale_info.input_full_size; + int filter_pixel_margin = samp->filter_pixel_margin; + + STBIR_ASSERT( samp->is_gather ); + + stop = output_sub_size; + for (j = 0; j < stop; j++ ) + { + STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 ); + if ( contributors[j].n0 < min_n ) + { + min_n = contributors[j].n0; + stop = j + filter_pixel_margin; // if we find a new min, only scan another filter width + if ( stop > output_sub_size ) stop = output_sub_size; + } + } + + stop = 0; + for (j = output_sub_size - 1; j >= stop; j-- ) + { + STBIR_ASSERT( contributors[j].n1 >= contributors[j].n0 ); + if ( contributors[j].n1 > max_n ) + { + max_n = contributors[j].n1; + stop = j - filter_pixel_margin; // if we find a new max, only scan another filter width + if (stop<0) stop = 0; + } + } + + STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n ); + STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n ); + + // now calculate how much into the margins we really read + left_margin = 0; + if ( min_n < 0 ) + { + left_margin = -min_n; + min_n = 0; + } + + right_margin = 0; + if ( max_n >= input_full_size ) + { + right_margin = max_n - input_full_size + 1; + max_n = input_full_size - 1; + } + + // index 1 is margin pixel extents (how many pixels we hang over the edge) + scanline_extents->edge_sizes[0] = left_margin; + scanline_extents->edge_sizes[1] = right_margin; + + // index 2 is pixels read from the input + scanline_extents->spans[0].n0 = min_n; + scanline_extents->spans[0].n1 = max_n; + scanline_extents->spans[0].pixel_offset_for_input = min_n; + + // default to no other input range + scanline_extents->spans[1].n0 = 0; + scanline_extents->spans[1].n1 = -1; + scanline_extents->spans[1].pixel_offset_for_input = 0; + + // don't have to do edge calc for zero clamp + if ( edge == STBIR_EDGE_ZERO ) + return; + + // convert margin pixels to the pixels within the input (min and max) + for( j = -left_margin ; j < 0 ; j++ ) + { + int p = stbir__edge_wrap( edge, j, input_full_size ); + if ( p < min_left ) + min_left = p; + if ( p > max_left ) + max_left = p; + } + + for( j = input_full_size ; j < (input_full_size + right_margin) ; j++ ) + { + int p = stbir__edge_wrap( edge, j, input_full_size ); + if ( p < min_right ) + min_right = p; + if ( p > max_right ) + max_right = p; + } + + // merge the left margin pixel region if it connects within 4 pixels of main pixel region + if ( min_left != 0x7fffffff ) + { + if ( ( ( min_left <= min_n ) && ( ( max_left + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) || + ( ( min_n <= min_left ) && ( ( max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_left ) ) ) + { + scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_left ); + scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_left ); + scanline_extents->spans[0].pixel_offset_for_input = min_n; + left_margin = 0; + } + } + + // merge the right margin pixel region if it connects within 4 pixels of main pixel region + if ( min_right != 0x7fffffff ) + { + if ( ( ( min_right <= min_n ) && ( ( max_right + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= min_n ) ) || + ( ( min_n <= min_right ) && ( ( max_n + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= max_right ) ) ) + { + scanline_extents->spans[0].n0 = min_n = stbir__min( min_n, min_right ); + scanline_extents->spans[0].n1 = max_n = stbir__max( max_n, max_right ); + scanline_extents->spans[0].pixel_offset_for_input = min_n; + right_margin = 0; + } + } + + STBIR_ASSERT( scanline_extents->conservative.n0 <= min_n ); + STBIR_ASSERT( scanline_extents->conservative.n1 >= max_n ); + + // you get two ranges when you have the WRAP edge mode and you are doing just the a piece of the resize + // so you need to get a second run of pixels from the opposite side of the scanline (which you + // wouldn't need except for WRAP) + + + // if we can't merge the min_left range, add it as a second range + if ( ( left_margin ) && ( min_left != 0x7fffffff ) ) + { + stbir__span * newspan = scanline_extents->spans + 1; + STBIR_ASSERT( right_margin == 0 ); + if ( min_left < scanline_extents->spans[0].n0 ) + { + scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0; + scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0; + scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1; + --newspan; + } + newspan->pixel_offset_for_input = min_left; + newspan->n0 = -left_margin; + newspan->n1 = ( max_left - min_left ) - left_margin; + scanline_extents->edge_sizes[0] = 0; // don't need to copy the left margin, since we are directly decoding into the margin + } + // if we can't merge the min_left range, add it as a second range + else + if ( ( right_margin ) && ( min_right != 0x7fffffff ) ) + { + stbir__span * newspan = scanline_extents->spans + 1; + if ( min_right < scanline_extents->spans[0].n0 ) + { + scanline_extents->spans[1].pixel_offset_for_input = scanline_extents->spans[0].n0; + scanline_extents->spans[1].n0 = scanline_extents->spans[0].n0; + scanline_extents->spans[1].n1 = scanline_extents->spans[0].n1; + --newspan; + } + newspan->pixel_offset_for_input = min_right; + newspan->n0 = scanline_extents->spans[1].n1 + 1; + newspan->n1 = scanline_extents->spans[1].n1 + 1 + ( max_right - min_right ); + scanline_extents->edge_sizes[1] = 0; // don't need to copy the right margin, since we are directly decoding into the margin + } + + // sort the spans into write output order + if ( ( scanline_extents->spans[1].n1 > scanline_extents->spans[1].n0 ) && ( scanline_extents->spans[0].n0 > scanline_extents->spans[1].n0 ) ) + { + stbir__span tspan = scanline_extents->spans[0]; + scanline_extents->spans[0] = scanline_extents->spans[1]; + scanline_extents->spans[1] = tspan; + } +} + +static void stbir__calculate_in_pixel_range( int * first_pixel, int * last_pixel, float out_pixel_center, float out_filter_radius, float inv_scale, float out_shift, int input_size, stbir_edge edge ) +{ + int first, last; + float out_pixel_influence_lowerbound = out_pixel_center - out_filter_radius; + float out_pixel_influence_upperbound = out_pixel_center + out_filter_radius; + + float in_pixel_influence_lowerbound = (out_pixel_influence_lowerbound + out_shift) * inv_scale; + float in_pixel_influence_upperbound = (out_pixel_influence_upperbound + out_shift) * inv_scale; + + first = (int)(STBIR_FLOORF(in_pixel_influence_lowerbound + 0.5f)); + last = (int)(STBIR_FLOORF(in_pixel_influence_upperbound - 0.5f)); + if ( last < first ) last = first; // point sample mode can span a value *right* at 0.5, and cause these to cross + + if ( edge == STBIR_EDGE_WRAP ) + { + if ( first < -input_size ) + first = -input_size; + if ( last >= (input_size*2)) + last = (input_size*2) - 1; + } + + *first_pixel = first; + *last_pixel = last; +} + +static void stbir__calculate_coefficients_for_gather_upsample( float out_filter_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float* coefficient_group, int coefficient_width, stbir_edge edge, void * user_data ) +{ + int n, end; + float inv_scale = scale_info->inv_scale; + float out_shift = scale_info->pixel_shift; + int input_size = scale_info->input_full_size; + int numerator = scale_info->scale_numerator; + int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) ); + + // Looping through out pixels + end = num_contributors; if ( polyphase ) end = numerator; + for (n = 0; n < end; n++) + { + int i; + int last_non_zero; + float out_pixel_center = (float)n + 0.5f; + float in_center_of_out = (out_pixel_center + out_shift) * inv_scale; + + int in_first_pixel, in_last_pixel; + + stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, out_pixel_center, out_filter_radius, inv_scale, out_shift, input_size, edge ); + + // make sure we never generate a range larger than our precalculated coeff width + // this only happens in point sample mode, but it's a good safe thing to do anyway + if ( ( in_last_pixel - in_first_pixel + 1 ) > coefficient_width ) + in_last_pixel = in_first_pixel + coefficient_width - 1; + + last_non_zero = -1; + for (i = 0; i <= in_last_pixel - in_first_pixel; i++) + { + float in_pixel_center = (float)(i + in_first_pixel) + 0.5f; + float coeff = kernel(in_center_of_out - in_pixel_center, inv_scale, user_data); + + // kill denormals + if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) ) + { + if ( i == 0 ) // if we're at the front, just eat zero contributors + { + STBIR_ASSERT ( ( in_last_pixel - in_first_pixel ) != 0 ); // there should be at least one contrib + ++in_first_pixel; + i--; + continue; + } + coeff = 0; // make sure is fully zero (should keep denormals away) + } + else + last_non_zero = i; + + coefficient_group[i] = coeff; + } + + in_last_pixel = last_non_zero+in_first_pixel; // kills trailing zeros + contributors->n0 = in_first_pixel; + contributors->n1 = in_last_pixel; + + STBIR_ASSERT(contributors->n1 >= contributors->n0); + + ++contributors; + coefficient_group += coefficient_width; + } +} + +static void stbir__insert_coeff( stbir__contributors * contribs, float * coeffs, int new_pixel, float new_coeff, int max_width ) +{ + if ( new_pixel <= contribs->n1 ) // before the end + { + if ( new_pixel < contribs->n0 ) // before the front? + { + if ( ( contribs->n1 - new_pixel + 1 ) <= max_width ) + { + int j, o = contribs->n0 - new_pixel; + for ( j = contribs->n1 - contribs->n0 ; j <= 0 ; j-- ) + coeffs[ j + o ] = coeffs[ j ]; + for ( j = 1 ; j < o ; j-- ) + coeffs[ j ] = coeffs[ 0 ]; + coeffs[ 0 ] = new_coeff; + contribs->n0 = new_pixel; + } + } + else + { + coeffs[ new_pixel - contribs->n0 ] += new_coeff; + } + } + else + { + if ( ( new_pixel - contribs->n0 + 1 ) <= max_width ) + { + int j, e = new_pixel - contribs->n0; + for( j = ( contribs->n1 - contribs->n0 ) + 1 ; j < e ; j++ ) // clear in-betweens coeffs if there are any + coeffs[j] = 0; + + coeffs[ e ] = new_coeff; + contribs->n1 = new_pixel; + } + } +} + +static void stbir__calculate_out_pixel_range( int * first_pixel, int * last_pixel, float in_pixel_center, float in_pixels_radius, float scale, float out_shift, int out_size ) +{ + float in_pixel_influence_lowerbound = in_pixel_center - in_pixels_radius; + float in_pixel_influence_upperbound = in_pixel_center + in_pixels_radius; + float out_pixel_influence_lowerbound = in_pixel_influence_lowerbound * scale - out_shift; + float out_pixel_influence_upperbound = in_pixel_influence_upperbound * scale - out_shift; + int out_first_pixel = (int)(STBIR_FLOORF(out_pixel_influence_lowerbound + 0.5f)); + int out_last_pixel = (int)(STBIR_FLOORF(out_pixel_influence_upperbound - 0.5f)); + + if ( out_first_pixel < 0 ) + out_first_pixel = 0; + if ( out_last_pixel >= out_size ) + out_last_pixel = out_size - 1; + *first_pixel = out_first_pixel; + *last_pixel = out_last_pixel; +} + +static void stbir__calculate_coefficients_for_gather_downsample( int start, int end, float in_pixels_radius, stbir__kernel_callback * kernel, stbir__scale_info * scale_info, int coefficient_width, int num_contributors, stbir__contributors * contributors, float * coefficient_group, void * user_data ) +{ + int in_pixel; + int i; + int first_out_inited = -1; + float scale = scale_info->scale; + float out_shift = scale_info->pixel_shift; + int out_size = scale_info->output_sub_size; + int numerator = scale_info->scale_numerator; + int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < out_size ) ); + + STBIR__UNUSED(num_contributors); + + // Loop through the input pixels + for (in_pixel = start; in_pixel < end; in_pixel++) + { + float in_pixel_center = (float)in_pixel + 0.5f; + float out_center_of_in = in_pixel_center * scale - out_shift; + int out_first_pixel, out_last_pixel; + + stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, in_pixel_center, in_pixels_radius, scale, out_shift, out_size ); + + if ( out_first_pixel > out_last_pixel ) + continue; + + // clamp or exit if we are using polyphase filtering, and the limit is up + if ( polyphase ) + { + // when polyphase, you only have to do coeffs up to the numerator count + if ( out_first_pixel == numerator ) + break; + + // don't do any extra work, clamp last pixel at numerator too + if ( out_last_pixel >= numerator ) + out_last_pixel = numerator - 1; + } + + for (i = 0; i <= out_last_pixel - out_first_pixel; i++) + { + float out_pixel_center = (float)(i + out_first_pixel) + 0.5f; + float x = out_pixel_center - out_center_of_in; + float coeff = kernel(x, scale, user_data) * scale; + + // kill the coeff if it's too small (avoid denormals) + if ( ( ( coeff < stbir__small_float ) && ( coeff > -stbir__small_float ) ) ) + coeff = 0.0f; + + { + int out = i + out_first_pixel; + float * coeffs = coefficient_group + out * coefficient_width; + stbir__contributors * contribs = contributors + out; + + // is this the first time this output pixel has been seen? Init it. + if ( out > first_out_inited ) + { + STBIR_ASSERT( out == ( first_out_inited + 1 ) ); // ensure we have only advanced one at time + first_out_inited = out; + contribs->n0 = in_pixel; + contribs->n1 = in_pixel; + coeffs[0] = coeff; + } + else + { + // insert on end (always in order) + if ( coeffs[0] == 0.0f ) // if the first coefficent is zero, then zap it for this coeffs + { + STBIR_ASSERT( ( in_pixel - contribs->n0 ) == 1 ); // ensure that when we zap, we're at the 2nd pos + contribs->n0 = in_pixel; + } + contribs->n1 = in_pixel; + STBIR_ASSERT( ( in_pixel - contribs->n0 ) < coefficient_width ); + coeffs[in_pixel - contribs->n0] = coeff; + } + } + } + } +} + +#ifdef STBIR_RENORMALIZE_IN_FLOAT +#define STBIR_RENORM_TYPE float +#else +#define STBIR_RENORM_TYPE double +#endif + +static void stbir__cleanup_gathered_coefficients( stbir_edge edge, stbir__filter_extent_info* filter_info, stbir__scale_info * scale_info, int num_contributors, stbir__contributors* contributors, float * coefficient_group, int coefficient_width ) +{ + int input_size = scale_info->input_full_size; + int input_last_n1 = input_size - 1; + int n, end; + int lowest = 0x7fffffff; + int highest = -0x7fffffff; + int widest = -1; + int numerator = scale_info->scale_numerator; + int denominator = scale_info->scale_denominator; + int polyphase = ( ( scale_info->scale_is_rational ) && ( numerator < num_contributors ) ); + float * coeffs; + stbir__contributors * contribs; + + // weight all the coeffs for each sample + coeffs = coefficient_group; + contribs = contributors; + end = num_contributors; if ( polyphase ) end = numerator; + for (n = 0; n < end; n++) + { + int i; + STBIR_RENORM_TYPE filter_scale, total_filter = 0; + int e; + + // add all contribs + e = contribs->n1 - contribs->n0; + for( i = 0 ; i <= e ; i++ ) + { + total_filter += (STBIR_RENORM_TYPE) coeffs[i]; + STBIR_ASSERT( ( coeffs[i] >= -2.0f ) && ( coeffs[i] <= 2.0f ) ); // check for wonky weights + } + + // rescale + if ( ( total_filter < stbir__small_float ) && ( total_filter > -stbir__small_float ) ) + { + // all coeffs are extremely small, just zero it + contribs->n1 = contribs->n0; + coeffs[0] = 0.0f; + } + else + { + // if the total isn't 1.0, rescale everything + if ( ( total_filter < (1.0f-stbir__small_float) ) || ( total_filter > (1.0f+stbir__small_float) ) ) + { + filter_scale = ((STBIR_RENORM_TYPE)1.0) / total_filter; + + // scale them all + for (i = 0; i <= e; i++) + coeffs[i] = (float) ( coeffs[i] * filter_scale ); + } + } + ++contribs; + coeffs += coefficient_width; + } + + // if we have a rational for the scale, we can exploit the polyphaseness to not calculate + // most of the coefficients, so we copy them here + if ( polyphase ) + { + stbir__contributors * prev_contribs = contributors; + stbir__contributors * cur_contribs = contributors + numerator; + + for( n = numerator ; n < num_contributors ; n++ ) + { + cur_contribs->n0 = prev_contribs->n0 + denominator; + cur_contribs->n1 = prev_contribs->n1 + denominator; + ++cur_contribs; + ++prev_contribs; + } + stbir_overlapping_memcpy( coefficient_group + numerator * coefficient_width, coefficient_group, ( num_contributors - numerator ) * coefficient_width * sizeof( coeffs[ 0 ] ) ); + } + + coeffs = coefficient_group; + contribs = contributors; + + for (n = 0; n < num_contributors; n++) + { + int i; + + // in zero edge mode, just remove out of bounds contribs completely (since their weights are accounted for now) + if ( edge == STBIR_EDGE_ZERO ) + { + // shrink the right side if necessary + if ( contribs->n1 > input_last_n1 ) + contribs->n1 = input_last_n1; + + // shrink the left side + if ( contribs->n0 < 0 ) + { + int j, left, skips = 0; + + skips = -contribs->n0; + contribs->n0 = 0; + + // now move down the weights + left = contribs->n1 - contribs->n0 + 1; + if ( left > 0 ) + { + for( j = 0 ; j < left ; j++ ) + coeffs[ j ] = coeffs[ j + skips ]; + } + } + } + else if ( ( edge == STBIR_EDGE_CLAMP ) || ( edge == STBIR_EDGE_REFLECT ) ) + { + // for clamp and reflect, calculate the true inbounds position (based on edge type) and just add that to the existing weight + + // right hand side first + if ( contribs->n1 > input_last_n1 ) + { + int start = contribs->n0; + int endi = contribs->n1; + contribs->n1 = input_last_n1; + for( i = input_size; i <= endi; i++ ) + stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), coeffs[i-start], coefficient_width ); + } + + // now check left hand edge + if ( contribs->n0 < 0 ) + { + int save_n0; + float save_n0_coeff; + float * c = coeffs - ( contribs->n0 + 1 ); + + // reinsert the coeffs with it reflected or clamped (insert accumulates, if the coeffs exist) + for( i = -1 ; i > contribs->n0 ; i-- ) + stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( i, input_size ), *c--, coefficient_width ); + save_n0 = contribs->n0; + save_n0_coeff = c[0]; // save it, since we didn't do the final one (i==n0), because there might be too many coeffs to hold (before we resize)! + + // now slide all the coeffs down (since we have accumulated them in the positive contribs) and reset the first contrib + contribs->n0 = 0; + for(i = 0 ; i <= contribs->n1 ; i++ ) + coeffs[i] = coeffs[i-save_n0]; + + // now that we have shrunk down the contribs, we insert the first one safely + stbir__insert_coeff( contribs, coeffs, stbir__edge_wrap_slow[edge]( save_n0, input_size ), save_n0_coeff, coefficient_width ); + } + } + + if ( contribs->n0 <= contribs->n1 ) + { + int diff = contribs->n1 - contribs->n0 + 1; + while ( diff && ( coeffs[ diff-1 ] == 0.0f ) ) + --diff; + + contribs->n1 = contribs->n0 + diff - 1; + + if ( contribs->n0 <= contribs->n1 ) + { + if ( contribs->n0 < lowest ) + lowest = contribs->n0; + if ( contribs->n1 > highest ) + highest = contribs->n1; + if ( diff > widest ) + widest = diff; + } + + // re-zero out unused coefficients (if any) + for( i = diff ; i < coefficient_width ; i++ ) + coeffs[i] = 0.0f; + } + + ++contribs; + coeffs += coefficient_width; + } + filter_info->lowest = lowest; + filter_info->highest = highest; + filter_info->widest = widest; +} + +#undef STBIR_RENORM_TYPE + +static int stbir__pack_coefficients( int num_contributors, stbir__contributors* contributors, float * coefficents, int coefficient_width, int widest, int row0, int row1 ) +{ + #define STBIR_MOVE_1( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint32*)(dest))[0] = ((stbir_uint32*)(src))[0]; } + #define STBIR_MOVE_2( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; } + #ifdef STBIR_SIMD + #define STBIR_MOVE_4( dest, src ) { stbir__simdf t; STBIR_NO_UNROLL(dest); stbir__simdf_load( t, src ); stbir__simdf_store( dest, t ); } + #else + #define STBIR_MOVE_4( dest, src ) { STBIR_NO_UNROLL(dest); ((stbir_uint64*)(dest))[0] = ((stbir_uint64*)(src))[0]; ((stbir_uint64*)(dest))[1] = ((stbir_uint64*)(src))[1]; } + #endif + + int row_end = row1 + 1; + STBIR__UNUSED( row0 ); // only used in an assert + + if ( coefficient_width != widest ) + { + float * pc = coefficents; + float * coeffs = coefficents; + float * pc_end = coefficents + num_contributors * widest; + switch( widest ) + { + case 1: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_1( pc, coeffs ); + ++pc; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 2: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_2( pc, coeffs ); + pc += 2; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 3: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_2( pc, coeffs ); + STBIR_MOVE_1( pc+2, coeffs+2 ); + pc += 3; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 4: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + pc += 4; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 5: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + STBIR_MOVE_1( pc+4, coeffs+4 ); + pc += 5; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 6: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + STBIR_MOVE_2( pc+4, coeffs+4 ); + pc += 6; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 7: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + STBIR_MOVE_2( pc+4, coeffs+4 ); + STBIR_MOVE_1( pc+6, coeffs+6 ); + pc += 7; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 8: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + STBIR_MOVE_4( pc+4, coeffs+4 ); + pc += 8; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 9: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + STBIR_MOVE_4( pc+4, coeffs+4 ); + STBIR_MOVE_1( pc+8, coeffs+8 ); + pc += 9; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 10: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + STBIR_MOVE_4( pc+4, coeffs+4 ); + STBIR_MOVE_2( pc+8, coeffs+8 ); + pc += 10; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 11: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + STBIR_MOVE_4( pc+4, coeffs+4 ); + STBIR_MOVE_2( pc+8, coeffs+8 ); + STBIR_MOVE_1( pc+10, coeffs+10 ); + pc += 11; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + case 12: + STBIR_NO_UNROLL_LOOP_START + do { + STBIR_MOVE_4( pc, coeffs ); + STBIR_MOVE_4( pc+4, coeffs+4 ); + STBIR_MOVE_4( pc+8, coeffs+8 ); + pc += 12; + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + default: + STBIR_NO_UNROLL_LOOP_START + do { + float * copy_end = pc + widest - 4; + float * c = coeffs; + do { + STBIR_NO_UNROLL( pc ); + STBIR_MOVE_4( pc, c ); + pc += 4; + c += 4; + } while ( pc <= copy_end ); + copy_end += 4; + STBIR_NO_UNROLL_LOOP_START + while ( pc < copy_end ) + { + STBIR_MOVE_1( pc, c ); + ++pc; ++c; + } + coeffs += coefficient_width; + } while ( pc < pc_end ); + break; + } + } + + // some horizontal routines read one float off the end (which is then masked off), so put in a sentinal so we don't read an snan or denormal + coefficents[ widest * num_contributors ] = 8888.0f; + + // the minimum we might read for unrolled filters widths is 12. So, we need to + // make sure we never read outside the decode buffer, by possibly moving + // the sample area back into the scanline, and putting zeros weights first. + // we start on the right edge and check until we're well past the possible + // clip area (2*widest). + { + stbir__contributors * contribs = contributors + num_contributors - 1; + float * coeffs = coefficents + widest * ( num_contributors - 1 ); + + // go until no chance of clipping (this is usually less than 8 lops) + while ( ( contribs >= contributors ) && ( ( contribs->n0 + widest*2 ) >= row_end ) ) + { + // might we clip?? + if ( ( contribs->n0 + widest ) > row_end ) + { + int stop_range = widest; + + // if range is larger than 12, it will be handled by generic loops that can terminate on the exact length + // of this contrib n1, instead of a fixed widest amount - so calculate this + if ( widest > 12 ) + { + int mod; + + // how far will be read in the n_coeff loop (which depends on the widest count mod4); + mod = widest & 3; + stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; + + // the n_coeff loops do a minimum amount of coeffs, so factor that in! + if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod; + } + + // now see if we still clip with the refined range + if ( ( contribs->n0 + stop_range ) > row_end ) + { + int new_n0 = row_end - stop_range; + int num = contribs->n1 - contribs->n0 + 1; + int backup = contribs->n0 - new_n0; + float * from_co = coeffs + num - 1; + float * to_co = from_co + backup; + + STBIR_ASSERT( ( new_n0 >= row0 ) && ( new_n0 < contribs->n0 ) ); + + // move the coeffs over + while( num ) + { + *to_co-- = *from_co--; + --num; + } + // zero new positions + while ( to_co >= coeffs ) + *to_co-- = 0; + // set new start point + contribs->n0 = new_n0; + if ( widest > 12 ) + { + int mod; + + // how far will be read in the n_coeff loop (which depends on the widest count mod4); + mod = widest & 3; + stop_range = ( ( ( contribs->n1 - contribs->n0 + 1 ) - mod + 3 ) & ~3 ) + mod; + + // the n_coeff loops do a minimum amount of coeffs, so factor that in! + if ( stop_range < ( 8 + mod ) ) stop_range = 8 + mod; + } + } + } + --contribs; + coeffs -= widest; + } + } + + return widest; + #undef STBIR_MOVE_1 + #undef STBIR_MOVE_2 + #undef STBIR_MOVE_4 +} + +static void stbir__calculate_filters( stbir__sampler * samp, stbir__sampler * other_axis_for_pivot, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO ) +{ + int n; + float scale = samp->scale_info.scale; + stbir__kernel_callback * kernel = samp->filter_kernel; + stbir__support_callback * support = samp->filter_support; + float inv_scale = samp->scale_info.inv_scale; + int input_full_size = samp->scale_info.input_full_size; + int gather_num_contributors = samp->num_contributors; + stbir__contributors* gather_contributors = samp->contributors; + float * gather_coeffs = samp->coefficients; + int gather_coefficient_width = samp->coefficient_width; + + switch ( samp->is_gather ) + { + case 1: // gather upsample + { + float out_pixels_radius = support(inv_scale,user_data) * scale; + + stbir__calculate_coefficients_for_gather_upsample( out_pixels_radius, kernel, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width, samp->edge, user_data ); + + STBIR_PROFILE_BUILD_START( cleanup ); + stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width ); + STBIR_PROFILE_BUILD_END( cleanup ); + } + break; + + case 0: // scatter downsample (only on vertical) + case 2: // gather downsample + { + float in_pixels_radius = support(scale,user_data) * inv_scale; + int filter_pixel_margin = samp->filter_pixel_margin; + int input_end = input_full_size + filter_pixel_margin; + + // if this is a scatter, we do a downsample gather to get the coeffs, and then pivot after + if ( !samp->is_gather ) + { + // check if we are using the same gather downsample on the horizontal as this vertical, + // if so, then we don't have to generate them, we can just pivot from the horizontal. + if ( other_axis_for_pivot ) + { + gather_contributors = other_axis_for_pivot->contributors; + gather_coeffs = other_axis_for_pivot->coefficients; + gather_coefficient_width = other_axis_for_pivot->coefficient_width; + gather_num_contributors = other_axis_for_pivot->num_contributors; + samp->extent_info.lowest = other_axis_for_pivot->extent_info.lowest; + samp->extent_info.highest = other_axis_for_pivot->extent_info.highest; + samp->extent_info.widest = other_axis_for_pivot->extent_info.widest; + goto jump_right_to_pivot; + } + + gather_contributors = samp->gather_prescatter_contributors; + gather_coeffs = samp->gather_prescatter_coefficients; + gather_coefficient_width = samp->gather_prescatter_coefficient_width; + gather_num_contributors = samp->gather_prescatter_num_contributors; + } + + stbir__calculate_coefficients_for_gather_downsample( -filter_pixel_margin, input_end, in_pixels_radius, kernel, &samp->scale_info, gather_coefficient_width, gather_num_contributors, gather_contributors, gather_coeffs, user_data ); + + STBIR_PROFILE_BUILD_START( cleanup ); + stbir__cleanup_gathered_coefficients( samp->edge, &samp->extent_info, &samp->scale_info, gather_num_contributors, gather_contributors, gather_coeffs, gather_coefficient_width ); + STBIR_PROFILE_BUILD_END( cleanup ); + + if ( !samp->is_gather ) + { + // if this is a scatter (vertical only), then we need to pivot the coeffs + stbir__contributors * scatter_contributors; + int highest_set; + + jump_right_to_pivot: + + STBIR_PROFILE_BUILD_START( pivot ); + + highest_set = (-filter_pixel_margin) - 1; + for (n = 0; n < gather_num_contributors; n++) + { + int k; + int gn0 = gather_contributors->n0, gn1 = gather_contributors->n1; + int scatter_coefficient_width = samp->coefficient_width; + float * scatter_coeffs = samp->coefficients + ( gn0 + filter_pixel_margin ) * scatter_coefficient_width; + float * g_coeffs = gather_coeffs; + scatter_contributors = samp->contributors + ( gn0 + filter_pixel_margin ); + + for (k = gn0 ; k <= gn1 ; k++ ) + { + float gc = *g_coeffs++; + + // skip zero and denormals - must skip zeros to avoid adding coeffs beyond scatter_coefficient_width + // (which happens when pivoting from horizontal, which might have dummy zeros) + if ( ( ( gc >= stbir__small_float ) || ( gc <= -stbir__small_float ) ) ) + { + if ( ( k > highest_set ) || ( scatter_contributors->n0 > scatter_contributors->n1 ) ) + { + { + // if we are skipping over several contributors, we need to clear the skipped ones + stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1); + while ( clear_contributors < scatter_contributors ) + { + clear_contributors->n0 = 0; + clear_contributors->n1 = -1; + ++clear_contributors; + } + } + scatter_contributors->n0 = n; + scatter_contributors->n1 = n; + scatter_coeffs[0] = gc; + highest_set = k; + } + else + { + stbir__insert_coeff( scatter_contributors, scatter_coeffs, n, gc, scatter_coefficient_width ); + } + STBIR_ASSERT( ( scatter_contributors->n1 - scatter_contributors->n0 + 1 ) <= scatter_coefficient_width ); + } + ++scatter_contributors; + scatter_coeffs += scatter_coefficient_width; + } + + ++gather_contributors; + gather_coeffs += gather_coefficient_width; + } + + // now clear any unset contribs + { + stbir__contributors * clear_contributors = samp->contributors + ( highest_set + filter_pixel_margin + 1); + stbir__contributors * end_contributors = samp->contributors + samp->num_contributors; + while ( clear_contributors < end_contributors ) + { + clear_contributors->n0 = 0; + clear_contributors->n1 = -1; + ++clear_contributors; + } + } + + STBIR_PROFILE_BUILD_END( pivot ); + } + } + break; + } +} + + +//======================================================================================================== +// scanline decoders and encoders + +#define stbir__coder_min_num 1 +#define STB_IMAGE_RESIZE_DO_CODERS +#include STBIR__HEADER_FILENAME + +#define stbir__decode_suffix BGRA +#define stbir__decode_swizzle +#define stbir__decode_order0 2 +#define stbir__decode_order1 1 +#define stbir__decode_order2 0 +#define stbir__decode_order3 3 +#define stbir__encode_order0 2 +#define stbir__encode_order1 1 +#define stbir__encode_order2 0 +#define stbir__encode_order3 3 +#define stbir__coder_min_num 4 +#define STB_IMAGE_RESIZE_DO_CODERS +#include STBIR__HEADER_FILENAME + +#define stbir__decode_suffix ARGB +#define stbir__decode_swizzle +#define stbir__decode_order0 1 +#define stbir__decode_order1 2 +#define stbir__decode_order2 3 +#define stbir__decode_order3 0 +#define stbir__encode_order0 3 +#define stbir__encode_order1 0 +#define stbir__encode_order2 1 +#define stbir__encode_order3 2 +#define stbir__coder_min_num 4 +#define STB_IMAGE_RESIZE_DO_CODERS +#include STBIR__HEADER_FILENAME + +#define stbir__decode_suffix ABGR +#define stbir__decode_swizzle +#define stbir__decode_order0 3 +#define stbir__decode_order1 2 +#define stbir__decode_order2 1 +#define stbir__decode_order3 0 +#define stbir__encode_order0 3 +#define stbir__encode_order1 2 +#define stbir__encode_order2 1 +#define stbir__encode_order3 0 +#define stbir__coder_min_num 4 +#define STB_IMAGE_RESIZE_DO_CODERS +#include STBIR__HEADER_FILENAME + +#define stbir__decode_suffix AR +#define stbir__decode_swizzle +#define stbir__decode_order0 1 +#define stbir__decode_order1 0 +#define stbir__decode_order2 3 +#define stbir__decode_order3 2 +#define stbir__encode_order0 1 +#define stbir__encode_order1 0 +#define stbir__encode_order2 3 +#define stbir__encode_order3 2 +#define stbir__coder_min_num 2 +#define STB_IMAGE_RESIZE_DO_CODERS +#include STBIR__HEADER_FILENAME + + +// fancy alpha means we expand to keep both premultipied and non-premultiplied color channels +static void stbir__fancy_alpha_weight_4ch( float * out_buffer, int width_times_channels ) +{ + float STBIR_STREAMOUT_PTR(*) out = out_buffer; + float const * end_decode = out_buffer + ( width_times_channels / 4 ) * 7; // decode buffer aligned to end of out_buffer + float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels; + + // fancy alpha is stored internally as R G B A Rpm Gpm Bpm + + #ifdef STBIR_SIMD + + #ifdef STBIR_SIMD8 + decode += 16; + STBIR_NO_UNROLL_LOOP_START + while ( decode <= end_decode ) + { + stbir__simdf8 d0,d1,a0,a1,p0,p1; + STBIR_NO_UNROLL(decode); + stbir__simdf8_load( d0, decode-16 ); + stbir__simdf8_load( d1, decode-16+8 ); + stbir__simdf8_0123to33333333( a0, d0 ); + stbir__simdf8_0123to33333333( a1, d1 ); + stbir__simdf8_mult( p0, a0, d0 ); + stbir__simdf8_mult( p1, a1, d1 ); + stbir__simdf8_bot4s( a0, d0, p0 ); + stbir__simdf8_bot4s( a1, d1, p1 ); + stbir__simdf8_top4s( d0, d0, p0 ); + stbir__simdf8_top4s( d1, d1, p1 ); + stbir__simdf8_store ( out, a0 ); + stbir__simdf8_store ( out+7, d0 ); + stbir__simdf8_store ( out+14, a1 ); + stbir__simdf8_store ( out+21, d1 ); + decode += 16; + out += 28; + } + decode -= 16; + #else + decode += 8; + STBIR_NO_UNROLL_LOOP_START + while ( decode <= end_decode ) + { + stbir__simdf d0,a0,d1,a1,p0,p1; + STBIR_NO_UNROLL(decode); + stbir__simdf_load( d0, decode-8 ); + stbir__simdf_load( d1, decode-8+4 ); + stbir__simdf_0123to3333( a0, d0 ); + stbir__simdf_0123to3333( a1, d1 ); + stbir__simdf_mult( p0, a0, d0 ); + stbir__simdf_mult( p1, a1, d1 ); + stbir__simdf_store ( out, d0 ); + stbir__simdf_store ( out+4, p0 ); + stbir__simdf_store ( out+7, d1 ); + stbir__simdf_store ( out+7+4, p1 ); + decode += 8; + out += 14; + } + decode -= 8; + #endif + + // might be one last odd pixel + #ifdef STBIR_SIMD8 + STBIR_NO_UNROLL_LOOP_START + while ( decode < end_decode ) + #else + if ( decode < end_decode ) + #endif + { + stbir__simdf d,a,p; + STBIR_NO_UNROLL(decode); + stbir__simdf_load( d, decode ); + stbir__simdf_0123to3333( a, d ); + stbir__simdf_mult( p, a, d ); + stbir__simdf_store ( out, d ); + stbir__simdf_store ( out+4, p ); + decode += 4; + out += 7; + } + + #else + + while( decode < end_decode ) + { + float r = decode[0], g = decode[1], b = decode[2], alpha = decode[3]; + out[0] = r; + out[1] = g; + out[2] = b; + out[3] = alpha; + out[4] = r * alpha; + out[5] = g * alpha; + out[6] = b * alpha; + out += 7; + decode += 4; + } + + #endif +} + +static void stbir__fancy_alpha_weight_2ch( float * out_buffer, int width_times_channels ) +{ + float STBIR_STREAMOUT_PTR(*) out = out_buffer; + float const * end_decode = out_buffer + ( width_times_channels / 2 ) * 3; + float STBIR_STREAMOUT_PTR(*) decode = (float*)end_decode - width_times_channels; + + // for fancy alpha, turns into: [X A Xpm][X A Xpm],etc + + #ifdef STBIR_SIMD + + decode += 8; + if ( decode <= end_decode ) + { + STBIR_NO_UNROLL_LOOP_START + do { + #ifdef STBIR_SIMD8 + stbir__simdf8 d0,a0,p0; + STBIR_NO_UNROLL(decode); + stbir__simdf8_load( d0, decode-8 ); + stbir__simdf8_0123to11331133( p0, d0 ); + stbir__simdf8_0123to00220022( a0, d0 ); + stbir__simdf8_mult( p0, p0, a0 ); + + stbir__simdf_store2( out, stbir__if_simdf8_cast_to_simdf4( d0 ) ); + stbir__simdf_store( out+2, stbir__if_simdf8_cast_to_simdf4( p0 ) ); + stbir__simdf_store2h( out+3, stbir__if_simdf8_cast_to_simdf4( d0 ) ); + + stbir__simdf_store2( out+6, stbir__simdf8_gettop4( d0 ) ); + stbir__simdf_store( out+8, stbir__simdf8_gettop4( p0 ) ); + stbir__simdf_store2h( out+9, stbir__simdf8_gettop4( d0 ) ); + #else + stbir__simdf d0,a0,d1,a1,p0,p1; + STBIR_NO_UNROLL(decode); + stbir__simdf_load( d0, decode-8 ); + stbir__simdf_load( d1, decode-8+4 ); + stbir__simdf_0123to1133( p0, d0 ); + stbir__simdf_0123to1133( p1, d1 ); + stbir__simdf_0123to0022( a0, d0 ); + stbir__simdf_0123to0022( a1, d1 ); + stbir__simdf_mult( p0, p0, a0 ); + stbir__simdf_mult( p1, p1, a1 ); + + stbir__simdf_store2( out, d0 ); + stbir__simdf_store( out+2, p0 ); + stbir__simdf_store2h( out+3, d0 ); + + stbir__simdf_store2( out+6, d1 ); + stbir__simdf_store( out+8, p1 ); + stbir__simdf_store2h( out+9, d1 ); + #endif + decode += 8; + out += 12; + } while ( decode <= end_decode ); + } + decode -= 8; + #endif + + STBIR_SIMD_NO_UNROLL_LOOP_START + while( decode < end_decode ) + { + float x = decode[0], y = decode[1]; + STBIR_SIMD_NO_UNROLL(decode); + out[0] = x; + out[1] = y; + out[2] = x * y; + out += 3; + decode += 2; + } +} + +static void stbir__fancy_alpha_unweight_4ch( float * encode_buffer, int width_times_channels ) +{ + float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer; + float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer; + float const * end_output = encode_buffer + width_times_channels; + + // fancy RGBA is stored internally as R G B A Rpm Gpm Bpm + + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float alpha = input[3]; +#ifdef STBIR_SIMD + stbir__simdf i,ia; + STBIR_SIMD_NO_UNROLL(encode); + if ( alpha < stbir__small_float ) + { + stbir__simdf_load( i, input ); + stbir__simdf_store( encode, i ); + } + else + { + stbir__simdf_load1frep4( ia, 1.0f / alpha ); + stbir__simdf_load( i, input+4 ); + stbir__simdf_mult( i, i, ia ); + stbir__simdf_store( encode, i ); + encode[3] = alpha; + } +#else + if ( alpha < stbir__small_float ) + { + encode[0] = input[0]; + encode[1] = input[1]; + encode[2] = input[2]; + } + else + { + float ialpha = 1.0f / alpha; + encode[0] = input[4] * ialpha; + encode[1] = input[5] * ialpha; + encode[2] = input[6] * ialpha; + } + encode[3] = alpha; +#endif + + input += 7; + encode += 4; + } while ( encode < end_output ); +} + +// format: [X A Xpm][X A Xpm] etc +static void stbir__fancy_alpha_unweight_2ch( float * encode_buffer, int width_times_channels ) +{ + float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer; + float STBIR_SIMD_STREAMOUT_PTR(*) input = encode_buffer; + float const * end_output = encode_buffer + width_times_channels; + + do { + float alpha = input[1]; + encode[0] = input[0]; + if ( alpha >= stbir__small_float ) + encode[0] = input[2] / alpha; + encode[1] = alpha; + + input += 3; + encode += 2; + } while ( encode < end_output ); +} + +static void stbir__simple_alpha_weight_4ch( float * decode_buffer, int width_times_channels ) +{ + float STBIR_STREAMOUT_PTR(*) decode = decode_buffer; + float const * end_decode = decode_buffer + width_times_channels; + + #ifdef STBIR_SIMD + { + decode += 2 * stbir__simdfX_float_count; + STBIR_NO_UNROLL_LOOP_START + while ( decode <= end_decode ) + { + stbir__simdfX d0,a0,d1,a1; + STBIR_NO_UNROLL(decode); + stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count ); + stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count ); + stbir__simdfX_aaa1( a0, d0, STBIR_onesX ); + stbir__simdfX_aaa1( a1, d1, STBIR_onesX ); + stbir__simdfX_mult( d0, d0, a0 ); + stbir__simdfX_mult( d1, d1, a1 ); + stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 ); + stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 ); + decode += 2 * stbir__simdfX_float_count; + } + decode -= 2 * stbir__simdfX_float_count; + + // few last pixels remnants + #ifdef STBIR_SIMD8 + STBIR_NO_UNROLL_LOOP_START + while ( decode < end_decode ) + #else + if ( decode < end_decode ) + #endif + { + stbir__simdf d,a; + stbir__simdf_load( d, decode ); + stbir__simdf_aaa1( a, d, STBIR__CONSTF(STBIR_ones) ); + stbir__simdf_mult( d, d, a ); + stbir__simdf_store ( decode, d ); + decode += 4; + } + } + + #else + + while( decode < end_decode ) + { + float alpha = decode[3]; + decode[0] *= alpha; + decode[1] *= alpha; + decode[2] *= alpha; + decode += 4; + } + + #endif +} + +static void stbir__simple_alpha_weight_2ch( float * decode_buffer, int width_times_channels ) +{ + float STBIR_STREAMOUT_PTR(*) decode = decode_buffer; + float const * end_decode = decode_buffer + width_times_channels; + + #ifdef STBIR_SIMD + decode += 2 * stbir__simdfX_float_count; + STBIR_NO_UNROLL_LOOP_START + while ( decode <= end_decode ) + { + stbir__simdfX d0,a0,d1,a1; + STBIR_NO_UNROLL(decode); + stbir__simdfX_load( d0, decode-2*stbir__simdfX_float_count ); + stbir__simdfX_load( d1, decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count ); + stbir__simdfX_a1a1( a0, d0, STBIR_onesX ); + stbir__simdfX_a1a1( a1, d1, STBIR_onesX ); + stbir__simdfX_mult( d0, d0, a0 ); + stbir__simdfX_mult( d1, d1, a1 ); + stbir__simdfX_store ( decode-2*stbir__simdfX_float_count, d0 ); + stbir__simdfX_store ( decode-2*stbir__simdfX_float_count+stbir__simdfX_float_count, d1 ); + decode += 2 * stbir__simdfX_float_count; + } + decode -= 2 * stbir__simdfX_float_count; + #endif + + STBIR_SIMD_NO_UNROLL_LOOP_START + while( decode < end_decode ) + { + float alpha = decode[1]; + STBIR_SIMD_NO_UNROLL(decode); + decode[0] *= alpha; + decode += 2; + } +} + +static void stbir__simple_alpha_unweight_4ch( float * encode_buffer, int width_times_channels ) +{ + float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer; + float const * end_output = encode_buffer + width_times_channels; + + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float alpha = encode[3]; + +#ifdef STBIR_SIMD + stbir__simdf i,ia; + STBIR_SIMD_NO_UNROLL(encode); + if ( alpha >= stbir__small_float ) + { + stbir__simdf_load1frep4( ia, 1.0f / alpha ); + stbir__simdf_load( i, encode ); + stbir__simdf_mult( i, i, ia ); + stbir__simdf_store( encode, i ); + encode[3] = alpha; + } +#else + if ( alpha >= stbir__small_float ) + { + float ialpha = 1.0f / alpha; + encode[0] *= ialpha; + encode[1] *= ialpha; + encode[2] *= ialpha; + } +#endif + encode += 4; + } while ( encode < end_output ); +} + +static void stbir__simple_alpha_unweight_2ch( float * encode_buffer, int width_times_channels ) +{ + float STBIR_SIMD_STREAMOUT_PTR(*) encode = encode_buffer; + float const * end_output = encode_buffer + width_times_channels; + + do { + float alpha = encode[1]; + if ( alpha >= stbir__small_float ) + encode[0] /= alpha; + encode += 2; + } while ( encode < end_output ); +} + + +// only used in RGB->BGR or BGR->RGB +static void stbir__simple_flip_3ch( float * decode_buffer, int width_times_channels ) +{ + float STBIR_STREAMOUT_PTR(*) decode = decode_buffer; + float const * end_decode = decode_buffer + width_times_channels; + +#ifdef STBIR_SIMD + #ifdef stbir__simdf_swiz2 // do we have two argument swizzles? + end_decode -= 12; + STBIR_NO_UNROLL_LOOP_START + while( decode <= end_decode ) + { + // on arm64 8 instructions, no overlapping stores + stbir__simdf a,b,c,na,nb; + STBIR_SIMD_NO_UNROLL(decode); + stbir__simdf_load( a, decode ); + stbir__simdf_load( b, decode+4 ); + stbir__simdf_load( c, decode+8 ); + + na = stbir__simdf_swiz2( a, b, 2, 1, 0, 5 ); + b = stbir__simdf_swiz2( a, b, 4, 3, 6, 7 ); + nb = stbir__simdf_swiz2( b, c, 0, 1, 4, 3 ); + c = stbir__simdf_swiz2( b, c, 2, 7, 6, 5 ); + + stbir__simdf_store( decode, na ); + stbir__simdf_store( decode+4, nb ); + stbir__simdf_store( decode+8, c ); + decode += 12; + } + end_decode += 12; + #else + end_decode -= 24; + STBIR_NO_UNROLL_LOOP_START + while( decode <= end_decode ) + { + // 26 instructions on x64 + stbir__simdf a,b,c,d,e,f,g; + float i21, i23; + STBIR_SIMD_NO_UNROLL(decode); + stbir__simdf_load( a, decode ); + stbir__simdf_load( b, decode+3 ); + stbir__simdf_load( c, decode+6 ); + stbir__simdf_load( d, decode+9 ); + stbir__simdf_load( e, decode+12 ); + stbir__simdf_load( f, decode+15 ); + stbir__simdf_load( g, decode+18 ); + + a = stbir__simdf_swiz( a, 2, 1, 0, 3 ); + b = stbir__simdf_swiz( b, 2, 1, 0, 3 ); + c = stbir__simdf_swiz( c, 2, 1, 0, 3 ); + d = stbir__simdf_swiz( d, 2, 1, 0, 3 ); + e = stbir__simdf_swiz( e, 2, 1, 0, 3 ); + f = stbir__simdf_swiz( f, 2, 1, 0, 3 ); + g = stbir__simdf_swiz( g, 2, 1, 0, 3 ); + + // stores overlap, need to be in order, + stbir__simdf_store( decode, a ); + i21 = decode[21]; + stbir__simdf_store( decode+3, b ); + i23 = decode[23]; + stbir__simdf_store( decode+6, c ); + stbir__simdf_store( decode+9, d ); + stbir__simdf_store( decode+12, e ); + stbir__simdf_store( decode+15, f ); + stbir__simdf_store( decode+18, g ); + decode[21] = i23; + decode[23] = i21; + decode += 24; + } + end_decode += 24; + #endif +#else + end_decode -= 12; + STBIR_NO_UNROLL_LOOP_START + while( decode <= end_decode ) + { + // 16 instructions + float t0,t1,t2,t3; + STBIR_NO_UNROLL(decode); + t0 = decode[0]; t1 = decode[3]; t2 = decode[6]; t3 = decode[9]; + decode[0] = decode[2]; decode[3] = decode[5]; decode[6] = decode[8]; decode[9] = decode[11]; + decode[2] = t0; decode[5] = t1; decode[8] = t2; decode[11] = t3; + decode += 12; + } + end_decode += 12; +#endif + + STBIR_NO_UNROLL_LOOP_START + while( decode < end_decode ) + { + float t = decode[0]; + STBIR_NO_UNROLL(decode); + decode[0] = decode[2]; + decode[2] = t; + decode += 3; + } +} + + + +static void stbir__decode_scanline(stbir__info const * stbir_info, int n, float * output_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO ) +{ + int channels = stbir_info->channels; + int effective_channels = stbir_info->effective_channels; + int input_sample_in_bytes = stbir__type_size[stbir_info->input_type] * channels; + stbir_edge edge_horizontal = stbir_info->horizontal.edge; + stbir_edge edge_vertical = stbir_info->vertical.edge; + int row = stbir__edge_wrap(edge_vertical, n, stbir_info->vertical.scale_info.input_full_size); + const void* input_plane_data = ( (char *) stbir_info->input_data ) + (size_t)row * (size_t) stbir_info->input_stride_bytes; + stbir__span const * spans = stbir_info->scanline_extents.spans; + float * full_decode_buffer = output_buffer - stbir_info->scanline_extents.conservative.n0 * effective_channels; + float * last_decoded = 0; + + // if we are on edge_zero, and we get in here with an out of bounds n, then the calculate filters has failed + STBIR_ASSERT( !(edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->vertical.scale_info.input_full_size)) ); + + do + { + float * decode_buffer; + void const * input_data; + float * end_decode; + int width_times_channels; + int width; + + if ( spans->n1 < spans->n0 ) + break; + + width = spans->n1 + 1 - spans->n0; + decode_buffer = full_decode_buffer + spans->n0 * effective_channels; + end_decode = full_decode_buffer + ( spans->n1 + 1 ) * effective_channels; + width_times_channels = width * channels; + + // read directly out of input plane by default + input_data = ( (char*)input_plane_data ) + spans->pixel_offset_for_input * input_sample_in_bytes; + + // if we have an input callback, call it to get the input data + if ( stbir_info->in_pixels_cb ) + { + // call the callback with a temp buffer (that they can choose to use or not). the temp is just right aligned memory in the decode_buffer itself + input_data = stbir_info->in_pixels_cb( ( (char*) end_decode ) - ( width * input_sample_in_bytes ) + ( ( stbir_info->input_type != STBIR_TYPE_FLOAT ) ? ( sizeof(float)*STBIR_INPUT_CALLBACK_PADDING ) : 0 ), input_plane_data, width, spans->pixel_offset_for_input, row, stbir_info->user_data ); + } + + STBIR_PROFILE_START( decode ); + // convert the pixels info the float decode_buffer, (we index from end_decode, so that when channelsdecode_pixels( (float*)end_decode - width_times_channels, width_times_channels, input_data ); + STBIR_PROFILE_END( decode ); + + if (stbir_info->alpha_weight) + { + STBIR_PROFILE_START( alpha ); + stbir_info->alpha_weight( decode_buffer, width_times_channels ); + STBIR_PROFILE_END( alpha ); + } + + ++spans; + } while ( spans <= ( &stbir_info->scanline_extents.spans[1] ) ); + + // handle the edge_wrap filter (all other types are handled back out at the calculate_filter stage) + // basically the idea here is that if we have the whole scanline in memory, we don't redecode the + // wrapped edge pixels, and instead just memcpy them from the scanline into the edge positions + if ( ( edge_horizontal == STBIR_EDGE_WRAP ) && ( stbir_info->scanline_extents.edge_sizes[0] | stbir_info->scanline_extents.edge_sizes[1] ) ) + { + // this code only runs if we're in edge_wrap, and we're doing the entire scanline + int e, start_x[2]; + int input_full_size = stbir_info->horizontal.scale_info.input_full_size; + + start_x[0] = -stbir_info->scanline_extents.edge_sizes[0]; // left edge start x + start_x[1] = input_full_size; // right edge + + for( e = 0; e < 2 ; e++ ) + { + // do each margin + int margin = stbir_info->scanline_extents.edge_sizes[e]; + if ( margin ) + { + int x = start_x[e]; + float * marg = full_decode_buffer + x * effective_channels; + float const * src = full_decode_buffer + stbir__edge_wrap(edge_horizontal, x, input_full_size) * effective_channels; + STBIR_MEMCPY( marg, src, margin * effective_channels * sizeof(float) ); + if ( e == 1 ) last_decoded = marg + margin * effective_channels; + } + } + } + + // some of the horizontal gathers read one float off the edge (which is masked out), but we force a zero here to make sure no NaNs leak in + // (we can't pre-zero it, because the input callback can use that area as padding) + last_decoded[0] = 0.0f; + + // we clear this extra float, because the final output pixel filter kernel might have used one less coeff than the max filter width + // when this happens, we do read that pixel from the input, so it too could be Nan, so just zero an extra one. + // this fits because each scanline is padded by three floats (STBIR_INPUT_CALLBACK_PADDING) + last_decoded[1] = 0.0f; +} + + +//================= +// Do 1 channel horizontal routines + +#ifdef STBIR_SIMD + +#define stbir__1_coeff_only() \ + stbir__simdf tot,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1( c, hc ); \ + stbir__simdf_mult1_mem( tot, c, decode ); + +#define stbir__2_coeff_only() \ + stbir__simdf tot,c,d; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load2z( c, hc ); \ + stbir__simdf_load2( d, decode ); \ + stbir__simdf_mult( tot, c, d ); \ + stbir__simdf_0123to1230( c, tot ); \ + stbir__simdf_add1( tot, tot, c ); + +#define stbir__3_coeff_only() \ + stbir__simdf tot,c,t; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( c, hc ); \ + stbir__simdf_mult_mem( tot, c, decode ); \ + stbir__simdf_0123to1230( c, tot ); \ + stbir__simdf_0123to2301( t, tot ); \ + stbir__simdf_add1( tot, tot, c ); \ + stbir__simdf_add1( tot, tot, t ); + +#define stbir__store_output_tiny() \ + stbir__simdf_store1( output, tot ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 1; + +#define stbir__4_coeff_start() \ + stbir__simdf tot,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( c, hc ); \ + stbir__simdf_mult_mem( tot, c, decode ); \ + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( c, hc + (ofs) ); \ + stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) ); + +#define stbir__1_coeff_remnant( ofs ) \ + { stbir__simdf d; \ + stbir__simdf_load1z( c, hc + (ofs) ); \ + stbir__simdf_load1( d, decode + (ofs) ); \ + stbir__simdf_madd( tot, tot, d, c ); } + +#define stbir__2_coeff_remnant( ofs ) \ + { stbir__simdf d; \ + stbir__simdf_load2z( c, hc+(ofs) ); \ + stbir__simdf_load2( d, decode+(ofs) ); \ + stbir__simdf_madd( tot, tot, d, c ); } + +#define stbir__3_coeff_setup() \ + stbir__simdf mask; \ + stbir__simdf_load( mask, STBIR_mask + 3 ); + +#define stbir__3_coeff_remnant( ofs ) \ + stbir__simdf_load( c, hc+(ofs) ); \ + stbir__simdf_and( c, c, mask ); \ + stbir__simdf_madd_mem( tot, tot, c, decode+(ofs) ); + +#define stbir__store_output() \ + stbir__simdf_0123to2301( c, tot ); \ + stbir__simdf_add( tot, tot, c ); \ + stbir__simdf_0123to1230( c, tot ); \ + stbir__simdf_add1( tot, tot, c ); \ + stbir__simdf_store1( output, tot ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 1; + +#else + +#define stbir__1_coeff_only() \ + float tot; \ + tot = decode[0]*hc[0]; + +#define stbir__2_coeff_only() \ + float tot; \ + tot = decode[0] * hc[0]; \ + tot += decode[1] * hc[1]; + +#define stbir__3_coeff_only() \ + float tot; \ + tot = decode[0] * hc[0]; \ + tot += decode[1] * hc[1]; \ + tot += decode[2] * hc[2]; + +#define stbir__store_output_tiny() \ + output[0] = tot; \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 1; + +#define stbir__4_coeff_start() \ + float tot0,tot1,tot2,tot3; \ + tot0 = decode[0] * hc[0]; \ + tot1 = decode[1] * hc[1]; \ + tot2 = decode[2] * hc[2]; \ + tot3 = decode[3] * hc[3]; + +#define stbir__4_coeff_continue_from_4( ofs ) \ + tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \ + tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \ + tot2 += decode[2+(ofs)] * hc[2+(ofs)]; \ + tot3 += decode[3+(ofs)] * hc[3+(ofs)]; + +#define stbir__1_coeff_remnant( ofs ) \ + tot0 += decode[0+(ofs)] * hc[0+(ofs)]; + +#define stbir__2_coeff_remnant( ofs ) \ + tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \ + tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \ + +#define stbir__3_coeff_remnant( ofs ) \ + tot0 += decode[0+(ofs)] * hc[0+(ofs)]; \ + tot1 += decode[1+(ofs)] * hc[1+(ofs)]; \ + tot2 += decode[2+(ofs)] * hc[2+(ofs)]; + +#define stbir__store_output() \ + output[0] = (tot0+tot2)+(tot1+tot3); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 1; + +#endif + +#define STBIR__horizontal_channels 1 +#define STB_IMAGE_RESIZE_DO_HORIZONTALS +#include STBIR__HEADER_FILENAME + + +//================= +// Do 2 channel horizontal routines + +#ifdef STBIR_SIMD + +#define stbir__1_coeff_only() \ + stbir__simdf tot,c,d; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1z( c, hc ); \ + stbir__simdf_0123to0011( c, c ); \ + stbir__simdf_load2( d, decode ); \ + stbir__simdf_mult( tot, d, c ); + +#define stbir__2_coeff_only() \ + stbir__simdf tot,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load2( c, hc ); \ + stbir__simdf_0123to0011( c, c ); \ + stbir__simdf_mult_mem( tot, c, decode ); + +#define stbir__3_coeff_only() \ + stbir__simdf tot,c,cs,d; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc ); \ + stbir__simdf_0123to0011( c, cs ); \ + stbir__simdf_mult_mem( tot, c, decode ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_load2z( d, decode+4 ); \ + stbir__simdf_madd( tot, tot, d, c ); + +#define stbir__store_output_tiny() \ + stbir__simdf_0123to2301( c, tot ); \ + stbir__simdf_add( tot, tot, c ); \ + stbir__simdf_store2( output, tot ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 2; + +#ifdef STBIR_SIMD8 + +#define stbir__4_coeff_start() \ + stbir__simdf8 tot0,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc ); \ + stbir__simdf8_0123to00112233( c, cs ); \ + stbir__simdf8_mult_mem( tot0, c, decode ); + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) ); \ + stbir__simdf8_0123to00112233( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); + +#define stbir__1_coeff_remnant( ofs ) \ + { stbir__simdf t,d; \ + stbir__simdf_load1z( t, hc + (ofs) ); \ + stbir__simdf_load2( d, decode + (ofs) * 2 ); \ + stbir__simdf_0123to0011( t, t ); \ + stbir__simdf_mult( t, t, d ); \ + stbir__simdf8_add4( tot0, tot0, t ); } + +#define stbir__2_coeff_remnant( ofs ) \ + { stbir__simdf t; \ + stbir__simdf_load2( t, hc + (ofs) ); \ + stbir__simdf_0123to0011( t, t ); \ + stbir__simdf_mult_mem( t, t, decode+(ofs)*2 ); \ + stbir__simdf8_add4( tot0, tot0, t ); } + +#define stbir__3_coeff_remnant( ofs ) \ + { stbir__simdf8 d; \ + stbir__simdf8_load4b( cs, hc + (ofs) ); \ + stbir__simdf8_0123to00112233( c, cs ); \ + stbir__simdf8_load6z( d, decode+(ofs)*2 ); \ + stbir__simdf8_madd( tot0, tot0, c, d ); } + +#define stbir__store_output() \ + { stbir__simdf t,d; \ + stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \ + stbir__simdf_0123to2301( d, t ); \ + stbir__simdf_add( t, t, d ); \ + stbir__simdf_store2( output, t ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 2; } + +#else + +#define stbir__4_coeff_start() \ + stbir__simdf tot0,tot1,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc ); \ + stbir__simdf_0123to0011( c, cs ); \ + stbir__simdf_mult_mem( tot0, c, decode ); \ + stbir__simdf_0123to2233( c, cs ); \ + stbir__simdf_mult_mem( tot1, c, decode+4 ); + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc + (ofs) ); \ + stbir__simdf_0123to0011( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \ + stbir__simdf_0123to2233( c, cs ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*2+4 ); + +#define stbir__1_coeff_remnant( ofs ) \ + { stbir__simdf d; \ + stbir__simdf_load1z( cs, hc + (ofs) ); \ + stbir__simdf_0123to0011( c, cs ); \ + stbir__simdf_load2( d, decode + (ofs) * 2 ); \ + stbir__simdf_madd( tot0, tot0, d, c ); } + +#define stbir__2_coeff_remnant( ofs ) \ + stbir__simdf_load2( cs, hc + (ofs) ); \ + stbir__simdf_0123to0011( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); + +#define stbir__3_coeff_remnant( ofs ) \ + { stbir__simdf d; \ + stbir__simdf_load( cs, hc + (ofs) ); \ + stbir__simdf_0123to0011( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*2 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_load2z( d, decode + (ofs) * 2 + 4 ); \ + stbir__simdf_madd( tot1, tot1, d, c ); } + +#define stbir__store_output() \ + stbir__simdf_add( tot0, tot0, tot1 ); \ + stbir__simdf_0123to2301( c, tot0 ); \ + stbir__simdf_add( tot0, tot0, c ); \ + stbir__simdf_store2( output, tot0 ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 2; + +#endif + +#else + +#define stbir__1_coeff_only() \ + float tota,totb,c; \ + c = hc[0]; \ + tota = decode[0]*c; \ + totb = decode[1]*c; + +#define stbir__2_coeff_only() \ + float tota,totb,c; \ + c = hc[0]; \ + tota = decode[0]*c; \ + totb = decode[1]*c; \ + c = hc[1]; \ + tota += decode[2]*c; \ + totb += decode[3]*c; + +// this weird order of add matches the simd +#define stbir__3_coeff_only() \ + float tota,totb,c; \ + c = hc[0]; \ + tota = decode[0]*c; \ + totb = decode[1]*c; \ + c = hc[2]; \ + tota += decode[4]*c; \ + totb += decode[5]*c; \ + c = hc[1]; \ + tota += decode[2]*c; \ + totb += decode[3]*c; + +#define stbir__store_output_tiny() \ + output[0] = tota; \ + output[1] = totb; \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 2; + +#define stbir__4_coeff_start() \ + float tota0,tota1,tota2,tota3,totb0,totb1,totb2,totb3,c; \ + c = hc[0]; \ + tota0 = decode[0]*c; \ + totb0 = decode[1]*c; \ + c = hc[1]; \ + tota1 = decode[2]*c; \ + totb1 = decode[3]*c; \ + c = hc[2]; \ + tota2 = decode[4]*c; \ + totb2 = decode[5]*c; \ + c = hc[3]; \ + tota3 = decode[6]*c; \ + totb3 = decode[7]*c; + +#define stbir__4_coeff_continue_from_4( ofs ) \ + c = hc[0+(ofs)]; \ + tota0 += decode[0+(ofs)*2]*c; \ + totb0 += decode[1+(ofs)*2]*c; \ + c = hc[1+(ofs)]; \ + tota1 += decode[2+(ofs)*2]*c; \ + totb1 += decode[3+(ofs)*2]*c; \ + c = hc[2+(ofs)]; \ + tota2 += decode[4+(ofs)*2]*c; \ + totb2 += decode[5+(ofs)*2]*c; \ + c = hc[3+(ofs)]; \ + tota3 += decode[6+(ofs)*2]*c; \ + totb3 += decode[7+(ofs)*2]*c; + +#define stbir__1_coeff_remnant( ofs ) \ + c = hc[0+(ofs)]; \ + tota0 += decode[0+(ofs)*2] * c; \ + totb0 += decode[1+(ofs)*2] * c; + +#define stbir__2_coeff_remnant( ofs ) \ + c = hc[0+(ofs)]; \ + tota0 += decode[0+(ofs)*2] * c; \ + totb0 += decode[1+(ofs)*2] * c; \ + c = hc[1+(ofs)]; \ + tota1 += decode[2+(ofs)*2] * c; \ + totb1 += decode[3+(ofs)*2] * c; + +#define stbir__3_coeff_remnant( ofs ) \ + c = hc[0+(ofs)]; \ + tota0 += decode[0+(ofs)*2] * c; \ + totb0 += decode[1+(ofs)*2] * c; \ + c = hc[1+(ofs)]; \ + tota1 += decode[2+(ofs)*2] * c; \ + totb1 += decode[3+(ofs)*2] * c; \ + c = hc[2+(ofs)]; \ + tota2 += decode[4+(ofs)*2] * c; \ + totb2 += decode[5+(ofs)*2] * c; + +#define stbir__store_output() \ + output[0] = (tota0+tota2)+(tota1+tota3); \ + output[1] = (totb0+totb2)+(totb1+totb3); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 2; + +#endif + +#define STBIR__horizontal_channels 2 +#define STB_IMAGE_RESIZE_DO_HORIZONTALS +#include STBIR__HEADER_FILENAME + + +//================= +// Do 3 channel horizontal routines + +#ifdef STBIR_SIMD + +#define stbir__1_coeff_only() \ + stbir__simdf tot,c,d; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1z( c, hc ); \ + stbir__simdf_0123to0001( c, c ); \ + stbir__simdf_load( d, decode ); \ + stbir__simdf_mult( tot, d, c ); + +#define stbir__2_coeff_only() \ + stbir__simdf tot,c,cs,d; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load2( cs, hc ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_load( d, decode ); \ + stbir__simdf_mult( tot, d, c ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_load( d, decode+3 ); \ + stbir__simdf_madd( tot, tot, d, c ); + +#define stbir__3_coeff_only() \ + stbir__simdf tot,c,d,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_load( d, decode ); \ + stbir__simdf_mult( tot, d, c ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_load( d, decode+3 ); \ + stbir__simdf_madd( tot, tot, d, c ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_load( d, decode+6 ); \ + stbir__simdf_madd( tot, tot, d, c ); + +#define stbir__store_output_tiny() \ + stbir__simdf_store2( output, tot ); \ + stbir__simdf_0123to2301( tot, tot ); \ + stbir__simdf_store1( output+2, tot ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 3; + +#ifdef STBIR_SIMD8 + +// we're loading from the XXXYYY decode by -1 to get the XXXYYY into different halves of the AVX reg fyi +#define stbir__4_coeff_start() \ + stbir__simdf8 tot0,tot1,c,cs; stbir__simdf t; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc ); \ + stbir__simdf8_0123to00001111( c, cs ); \ + stbir__simdf8_mult_mem( tot0, c, decode - 1 ); \ + stbir__simdf8_0123to22223333( c, cs ); \ + stbir__simdf8_mult_mem( tot1, c, decode+6 - 1 ); + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) ); \ + stbir__simdf8_0123to00001111( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \ + stbir__simdf8_0123to22223333( c, cs ); \ + stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*3 + 6 - 1 ); + +#define stbir__1_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1rep4( t, hc + (ofs) ); \ + stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*3 - 1 ); + +#define stbir__2_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \ + stbir__simdf8_0123to22223333( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); + + #define stbir__3_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) ); \ + stbir__simdf8_0123to00001111( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*3 - 1 ); \ + stbir__simdf8_0123to2222( t, cs ); \ + stbir__simdf8_madd_mem4( tot1, tot1, t, decode+(ofs)*3 + 6 - 1 ); + +#define stbir__store_output() \ + stbir__simdf8_add( tot0, tot0, tot1 ); \ + stbir__simdf_0123to1230( t, stbir__if_simdf8_cast_to_simdf4( tot0 ) ); \ + stbir__simdf8_add4halves( t, t, tot0 ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 3; \ + if ( output < output_end ) \ + { \ + stbir__simdf_store( output-3, t ); \ + continue; \ + } \ + { stbir__simdf tt; stbir__simdf_0123to2301( tt, t ); \ + stbir__simdf_store2( output-3, t ); \ + stbir__simdf_store1( output+2-3, tt ); } \ + break; + + +#else + +#define stbir__4_coeff_start() \ + stbir__simdf tot0,tot1,tot2,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc ); \ + stbir__simdf_0123to0001( c, cs ); \ + stbir__simdf_mult_mem( tot0, c, decode ); \ + stbir__simdf_0123to1122( c, cs ); \ + stbir__simdf_mult_mem( tot1, c, decode+4 ); \ + stbir__simdf_0123to2333( c, cs ); \ + stbir__simdf_mult_mem( tot2, c, decode+8 ); + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc + (ofs) ); \ + stbir__simdf_0123to0001( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \ + stbir__simdf_0123to1122( c, cs ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \ + stbir__simdf_0123to2333( c, cs ); \ + stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*3+8 ); + +#define stbir__1_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1z( c, hc + (ofs) ); \ + stbir__simdf_0123to0001( c, c ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); + +#define stbir__2_coeff_remnant( ofs ) \ + { stbir__simdf d; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load2z( cs, hc + (ofs) ); \ + stbir__simdf_0123to0001( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \ + stbir__simdf_0123to1122( c, cs ); \ + stbir__simdf_load2z( d, decode+(ofs)*3+4 ); \ + stbir__simdf_madd( tot1, tot1, c, d ); } + +#define stbir__3_coeff_remnant( ofs ) \ + { stbir__simdf d; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc + (ofs) ); \ + stbir__simdf_0123to0001( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*3 ); \ + stbir__simdf_0123to1122( c, cs ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*3+4 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_load1z( d, decode+(ofs)*3+8 ); \ + stbir__simdf_madd( tot2, tot2, c, d ); } + +#define stbir__store_output() \ + stbir__simdf_0123ABCDto3ABx( c, tot0, tot1 ); \ + stbir__simdf_0123ABCDto23Ax( cs, tot1, tot2 ); \ + stbir__simdf_0123to1230( tot2, tot2 ); \ + stbir__simdf_add( tot0, tot0, cs ); \ + stbir__simdf_add( c, c, tot2 ); \ + stbir__simdf_add( tot0, tot0, c ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 3; \ + if ( output < output_end ) \ + { \ + stbir__simdf_store( output-3, tot0 ); \ + continue; \ + } \ + stbir__simdf_0123to2301( tot1, tot0 ); \ + stbir__simdf_store2( output-3, tot0 ); \ + stbir__simdf_store1( output+2-3, tot1 ); \ + break; + +#endif + +#else + +#define stbir__1_coeff_only() \ + float tot0, tot1, tot2, c; \ + c = hc[0]; \ + tot0 = decode[0]*c; \ + tot1 = decode[1]*c; \ + tot2 = decode[2]*c; + +#define stbir__2_coeff_only() \ + float tot0, tot1, tot2, c; \ + c = hc[0]; \ + tot0 = decode[0]*c; \ + tot1 = decode[1]*c; \ + tot2 = decode[2]*c; \ + c = hc[1]; \ + tot0 += decode[3]*c; \ + tot1 += decode[4]*c; \ + tot2 += decode[5]*c; + +#define stbir__3_coeff_only() \ + float tot0, tot1, tot2, c; \ + c = hc[0]; \ + tot0 = decode[0]*c; \ + tot1 = decode[1]*c; \ + tot2 = decode[2]*c; \ + c = hc[1]; \ + tot0 += decode[3]*c; \ + tot1 += decode[4]*c; \ + tot2 += decode[5]*c; \ + c = hc[2]; \ + tot0 += decode[6]*c; \ + tot1 += decode[7]*c; \ + tot2 += decode[8]*c; + +#define stbir__store_output_tiny() \ + output[0] = tot0; \ + output[1] = tot1; \ + output[2] = tot2; \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 3; + +#define stbir__4_coeff_start() \ + float tota0,tota1,tota2,totb0,totb1,totb2,totc0,totc1,totc2,totd0,totd1,totd2,c; \ + c = hc[0]; \ + tota0 = decode[0]*c; \ + tota1 = decode[1]*c; \ + tota2 = decode[2]*c; \ + c = hc[1]; \ + totb0 = decode[3]*c; \ + totb1 = decode[4]*c; \ + totb2 = decode[5]*c; \ + c = hc[2]; \ + totc0 = decode[6]*c; \ + totc1 = decode[7]*c; \ + totc2 = decode[8]*c; \ + c = hc[3]; \ + totd0 = decode[9]*c; \ + totd1 = decode[10]*c; \ + totd2 = decode[11]*c; + +#define stbir__4_coeff_continue_from_4( ofs ) \ + c = hc[0+(ofs)]; \ + tota0 += decode[0+(ofs)*3]*c; \ + tota1 += decode[1+(ofs)*3]*c; \ + tota2 += decode[2+(ofs)*3]*c; \ + c = hc[1+(ofs)]; \ + totb0 += decode[3+(ofs)*3]*c; \ + totb1 += decode[4+(ofs)*3]*c; \ + totb2 += decode[5+(ofs)*3]*c; \ + c = hc[2+(ofs)]; \ + totc0 += decode[6+(ofs)*3]*c; \ + totc1 += decode[7+(ofs)*3]*c; \ + totc2 += decode[8+(ofs)*3]*c; \ + c = hc[3+(ofs)]; \ + totd0 += decode[9+(ofs)*3]*c; \ + totd1 += decode[10+(ofs)*3]*c; \ + totd2 += decode[11+(ofs)*3]*c; + +#define stbir__1_coeff_remnant( ofs ) \ + c = hc[0+(ofs)]; \ + tota0 += decode[0+(ofs)*3]*c; \ + tota1 += decode[1+(ofs)*3]*c; \ + tota2 += decode[2+(ofs)*3]*c; + +#define stbir__2_coeff_remnant( ofs ) \ + c = hc[0+(ofs)]; \ + tota0 += decode[0+(ofs)*3]*c; \ + tota1 += decode[1+(ofs)*3]*c; \ + tota2 += decode[2+(ofs)*3]*c; \ + c = hc[1+(ofs)]; \ + totb0 += decode[3+(ofs)*3]*c; \ + totb1 += decode[4+(ofs)*3]*c; \ + totb2 += decode[5+(ofs)*3]*c; \ + +#define stbir__3_coeff_remnant( ofs ) \ + c = hc[0+(ofs)]; \ + tota0 += decode[0+(ofs)*3]*c; \ + tota1 += decode[1+(ofs)*3]*c; \ + tota2 += decode[2+(ofs)*3]*c; \ + c = hc[1+(ofs)]; \ + totb0 += decode[3+(ofs)*3]*c; \ + totb1 += decode[4+(ofs)*3]*c; \ + totb2 += decode[5+(ofs)*3]*c; \ + c = hc[2+(ofs)]; \ + totc0 += decode[6+(ofs)*3]*c; \ + totc1 += decode[7+(ofs)*3]*c; \ + totc2 += decode[8+(ofs)*3]*c; + +#define stbir__store_output() \ + output[0] = (tota0+totc0)+(totb0+totd0); \ + output[1] = (tota1+totc1)+(totb1+totd1); \ + output[2] = (tota2+totc2)+(totb2+totd2); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 3; + +#endif + +#define STBIR__horizontal_channels 3 +#define STB_IMAGE_RESIZE_DO_HORIZONTALS +#include STBIR__HEADER_FILENAME + +//================= +// Do 4 channel horizontal routines + +#ifdef STBIR_SIMD + +#define stbir__1_coeff_only() \ + stbir__simdf tot,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1( c, hc ); \ + stbir__simdf_0123to0000( c, c ); \ + stbir__simdf_mult_mem( tot, c, decode ); + +#define stbir__2_coeff_only() \ + stbir__simdf tot,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load2( cs, hc ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_mult_mem( tot, c, decode ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot, tot, c, decode+4 ); + +#define stbir__3_coeff_only() \ + stbir__simdf tot,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_mult_mem( tot, c, decode ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot, tot, c, decode+4 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_madd_mem( tot, tot, c, decode+8 ); + +#define stbir__store_output_tiny() \ + stbir__simdf_store( output, tot ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 4; + +#ifdef STBIR_SIMD8 + +#define stbir__4_coeff_start() \ + stbir__simdf8 tot0,c,cs; stbir__simdf t; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc ); \ + stbir__simdf8_0123to00001111( c, cs ); \ + stbir__simdf8_mult_mem( tot0, c, decode ); \ + stbir__simdf8_0123to22223333( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+8 ); + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) ); \ + stbir__simdf8_0123to00001111( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \ + stbir__simdf8_0123to22223333( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); + +#define stbir__1_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1rep4( t, hc + (ofs) ); \ + stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4 ); + +#define stbir__2_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) - 2 ); \ + stbir__simdf8_0123to22223333( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); + + #define stbir__3_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) ); \ + stbir__simdf8_0123to00001111( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \ + stbir__simdf8_0123to2222( t, cs ); \ + stbir__simdf8_madd_mem4( tot0, tot0, t, decode+(ofs)*4+8 ); + +#define stbir__store_output() \ + stbir__simdf8_add4halves( t, stbir__if_simdf8_cast_to_simdf4(tot0), tot0 ); \ + stbir__simdf_store( output, t ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 4; + +#else + +#define stbir__4_coeff_start() \ + stbir__simdf tot0,tot1,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_mult_mem( tot0, c, decode ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_mult_mem( tot1, c, decode+4 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+8 ); \ + stbir__simdf_0123to3333( c, cs ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+12 ); + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc + (ofs) ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); \ + stbir__simdf_0123to3333( c, cs ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+12 ); + +#define stbir__1_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1( c, hc + (ofs) ); \ + stbir__simdf_0123to0000( c, c ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); + +#define stbir__2_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load2( cs, hc + (ofs) ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); + +#define stbir__3_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc + (ofs) ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*4+4 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*4+8 ); + +#define stbir__store_output() \ + stbir__simdf_add( tot0, tot0, tot1 ); \ + stbir__simdf_store( output, tot0 ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 4; + +#endif + +#else + +#define stbir__1_coeff_only() \ + float p0,p1,p2,p3,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0]; \ + p0 = decode[0] * c; \ + p1 = decode[1] * c; \ + p2 = decode[2] * c; \ + p3 = decode[3] * c; + +#define stbir__2_coeff_only() \ + float p0,p1,p2,p3,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0]; \ + p0 = decode[0] * c; \ + p1 = decode[1] * c; \ + p2 = decode[2] * c; \ + p3 = decode[3] * c; \ + c = hc[1]; \ + p0 += decode[4] * c; \ + p1 += decode[5] * c; \ + p2 += decode[6] * c; \ + p3 += decode[7] * c; + +#define stbir__3_coeff_only() \ + float p0,p1,p2,p3,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0]; \ + p0 = decode[0] * c; \ + p1 = decode[1] * c; \ + p2 = decode[2] * c; \ + p3 = decode[3] * c; \ + c = hc[1]; \ + p0 += decode[4] * c; \ + p1 += decode[5] * c; \ + p2 += decode[6] * c; \ + p3 += decode[7] * c; \ + c = hc[2]; \ + p0 += decode[8] * c; \ + p1 += decode[9] * c; \ + p2 += decode[10] * c; \ + p3 += decode[11] * c; + +#define stbir__store_output_tiny() \ + output[0] = p0; \ + output[1] = p1; \ + output[2] = p2; \ + output[3] = p3; \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 4; + +#define stbir__4_coeff_start() \ + float x0,x1,x2,x3,y0,y1,y2,y3,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0]; \ + x0 = decode[0] * c; \ + x1 = decode[1] * c; \ + x2 = decode[2] * c; \ + x3 = decode[3] * c; \ + c = hc[1]; \ + y0 = decode[4] * c; \ + y1 = decode[5] * c; \ + y2 = decode[6] * c; \ + y3 = decode[7] * c; \ + c = hc[2]; \ + x0 += decode[8] * c; \ + x1 += decode[9] * c; \ + x2 += decode[10] * c; \ + x3 += decode[11] * c; \ + c = hc[3]; \ + y0 += decode[12] * c; \ + y1 += decode[13] * c; \ + y2 += decode[14] * c; \ + y3 += decode[15] * c; + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0+(ofs)]; \ + x0 += decode[0+(ofs)*4] * c; \ + x1 += decode[1+(ofs)*4] * c; \ + x2 += decode[2+(ofs)*4] * c; \ + x3 += decode[3+(ofs)*4] * c; \ + c = hc[1+(ofs)]; \ + y0 += decode[4+(ofs)*4] * c; \ + y1 += decode[5+(ofs)*4] * c; \ + y2 += decode[6+(ofs)*4] * c; \ + y3 += decode[7+(ofs)*4] * c; \ + c = hc[2+(ofs)]; \ + x0 += decode[8+(ofs)*4] * c; \ + x1 += decode[9+(ofs)*4] * c; \ + x2 += decode[10+(ofs)*4] * c; \ + x3 += decode[11+(ofs)*4] * c; \ + c = hc[3+(ofs)]; \ + y0 += decode[12+(ofs)*4] * c; \ + y1 += decode[13+(ofs)*4] * c; \ + y2 += decode[14+(ofs)*4] * c; \ + y3 += decode[15+(ofs)*4] * c; + +#define stbir__1_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0+(ofs)]; \ + x0 += decode[0+(ofs)*4] * c; \ + x1 += decode[1+(ofs)*4] * c; \ + x2 += decode[2+(ofs)*4] * c; \ + x3 += decode[3+(ofs)*4] * c; + +#define stbir__2_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0+(ofs)]; \ + x0 += decode[0+(ofs)*4] * c; \ + x1 += decode[1+(ofs)*4] * c; \ + x2 += decode[2+(ofs)*4] * c; \ + x3 += decode[3+(ofs)*4] * c; \ + c = hc[1+(ofs)]; \ + y0 += decode[4+(ofs)*4] * c; \ + y1 += decode[5+(ofs)*4] * c; \ + y2 += decode[6+(ofs)*4] * c; \ + y3 += decode[7+(ofs)*4] * c; + +#define stbir__3_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0+(ofs)]; \ + x0 += decode[0+(ofs)*4] * c; \ + x1 += decode[1+(ofs)*4] * c; \ + x2 += decode[2+(ofs)*4] * c; \ + x3 += decode[3+(ofs)*4] * c; \ + c = hc[1+(ofs)]; \ + y0 += decode[4+(ofs)*4] * c; \ + y1 += decode[5+(ofs)*4] * c; \ + y2 += decode[6+(ofs)*4] * c; \ + y3 += decode[7+(ofs)*4] * c; \ + c = hc[2+(ofs)]; \ + x0 += decode[8+(ofs)*4] * c; \ + x1 += decode[9+(ofs)*4] * c; \ + x2 += decode[10+(ofs)*4] * c; \ + x3 += decode[11+(ofs)*4] * c; + +#define stbir__store_output() \ + output[0] = x0 + y0; \ + output[1] = x1 + y1; \ + output[2] = x2 + y2; \ + output[3] = x3 + y3; \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 4; + +#endif + +#define STBIR__horizontal_channels 4 +#define STB_IMAGE_RESIZE_DO_HORIZONTALS +#include STBIR__HEADER_FILENAME + + + +//================= +// Do 7 channel horizontal routines + +#ifdef STBIR_SIMD + +#define stbir__1_coeff_only() \ + stbir__simdf tot0,tot1,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1( c, hc ); \ + stbir__simdf_0123to0000( c, c ); \ + stbir__simdf_mult_mem( tot0, c, decode ); \ + stbir__simdf_mult_mem( tot1, c, decode+3 ); + +#define stbir__2_coeff_only() \ + stbir__simdf tot0,tot1,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load2( cs, hc ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_mult_mem( tot0, c, decode ); \ + stbir__simdf_mult_mem( tot1, c, decode+3 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \ + stbir__simdf_madd_mem( tot1, tot1, c,decode+10 ); + +#define stbir__3_coeff_only() \ + stbir__simdf tot0,tot1,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_mult_mem( tot0, c, decode ); \ + stbir__simdf_mult_mem( tot1, c, decode+3 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+7 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+10 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+17 ); + +#define stbir__store_output_tiny() \ + stbir__simdf_store( output+3, tot1 ); \ + stbir__simdf_store( output, tot0 ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 7; + +#ifdef STBIR_SIMD8 + +#define stbir__4_coeff_start() \ + stbir__simdf8 tot0,tot1,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc ); \ + stbir__simdf8_0123to00000000( c, cs ); \ + stbir__simdf8_mult_mem( tot0, c, decode ); \ + stbir__simdf8_0123to11111111( c, cs ); \ + stbir__simdf8_mult_mem( tot1, c, decode+7 ); \ + stbir__simdf8_0123to22222222( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+14 ); \ + stbir__simdf8_0123to33333333( c, cs ); \ + stbir__simdf8_madd_mem( tot1, tot1, c, decode+21 ); + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) ); \ + stbir__simdf8_0123to00000000( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \ + stbir__simdf8_0123to11111111( c, cs ); \ + stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \ + stbir__simdf8_0123to22222222( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \ + stbir__simdf8_0123to33333333( c, cs ); \ + stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+21 ); + +#define stbir__1_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load1b( c, hc + (ofs) ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); + +#define stbir__2_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load1b( c, hc + (ofs) ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \ + stbir__simdf8_load1b( c, hc + (ofs)+1 ); \ + stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); + +#define stbir__3_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf8_load4b( cs, hc + (ofs) ); \ + stbir__simdf8_0123to00000000( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \ + stbir__simdf8_0123to11111111( c, cs ); \ + stbir__simdf8_madd_mem( tot1, tot1, c, decode+(ofs)*7+7 ); \ + stbir__simdf8_0123to22222222( c, cs ); \ + stbir__simdf8_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); + +#define stbir__store_output() \ + stbir__simdf8_add( tot0, tot0, tot1 ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 7; \ + if ( output < output_end ) \ + { \ + stbir__simdf8_store( output-7, tot0 ); \ + continue; \ + } \ + stbir__simdf_store( output-7+3, stbir__simdf_swiz(stbir__simdf8_gettop4(tot0),0,0,1,2) ); \ + stbir__simdf_store( output-7, stbir__if_simdf8_cast_to_simdf4(tot0) ); \ + break; + +#else + +#define stbir__4_coeff_start() \ + stbir__simdf tot0,tot1,tot2,tot3,c,cs; \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_mult_mem( tot0, c, decode ); \ + stbir__simdf_mult_mem( tot1, c, decode+3 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_mult_mem( tot2, c, decode+7 ); \ + stbir__simdf_mult_mem( tot3, c, decode+10 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+14 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+17 ); \ + stbir__simdf_0123to3333( c, cs ); \ + stbir__simdf_madd_mem( tot2, tot2, c, decode+21 ); \ + stbir__simdf_madd_mem( tot3, tot3, c, decode+24 ); + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc + (ofs) ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \ + stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 ); \ + stbir__simdf_0123to3333( c, cs ); \ + stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+21 ); \ + stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+24 ); + +#define stbir__1_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load1( c, hc + (ofs) ); \ + stbir__simdf_0123to0000( c, c ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \ + +#define stbir__2_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load2( cs, hc + (ofs) ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \ + stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); + +#define stbir__3_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + stbir__simdf_load( cs, hc + (ofs) ); \ + stbir__simdf_0123to0000( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+3 ); \ + stbir__simdf_0123to1111( c, cs ); \ + stbir__simdf_madd_mem( tot2, tot2, c, decode+(ofs)*7+7 ); \ + stbir__simdf_madd_mem( tot3, tot3, c, decode+(ofs)*7+10 ); \ + stbir__simdf_0123to2222( c, cs ); \ + stbir__simdf_madd_mem( tot0, tot0, c, decode+(ofs)*7+14 ); \ + stbir__simdf_madd_mem( tot1, tot1, c, decode+(ofs)*7+17 ); + +#define stbir__store_output() \ + stbir__simdf_add( tot0, tot0, tot2 ); \ + stbir__simdf_add( tot1, tot1, tot3 ); \ + stbir__simdf_store( output+3, tot1 ); \ + stbir__simdf_store( output, tot0 ); \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 7; + +#endif + +#else + +#define stbir__1_coeff_only() \ + float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \ + c = hc[0]; \ + tot0 = decode[0]*c; \ + tot1 = decode[1]*c; \ + tot2 = decode[2]*c; \ + tot3 = decode[3]*c; \ + tot4 = decode[4]*c; \ + tot5 = decode[5]*c; \ + tot6 = decode[6]*c; + +#define stbir__2_coeff_only() \ + float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \ + c = hc[0]; \ + tot0 = decode[0]*c; \ + tot1 = decode[1]*c; \ + tot2 = decode[2]*c; \ + tot3 = decode[3]*c; \ + tot4 = decode[4]*c; \ + tot5 = decode[5]*c; \ + tot6 = decode[6]*c; \ + c = hc[1]; \ + tot0 += decode[7]*c; \ + tot1 += decode[8]*c; \ + tot2 += decode[9]*c; \ + tot3 += decode[10]*c; \ + tot4 += decode[11]*c; \ + tot5 += decode[12]*c; \ + tot6 += decode[13]*c; \ + +#define stbir__3_coeff_only() \ + float tot0, tot1, tot2, tot3, tot4, tot5, tot6, c; \ + c = hc[0]; \ + tot0 = decode[0]*c; \ + tot1 = decode[1]*c; \ + tot2 = decode[2]*c; \ + tot3 = decode[3]*c; \ + tot4 = decode[4]*c; \ + tot5 = decode[5]*c; \ + tot6 = decode[6]*c; \ + c = hc[1]; \ + tot0 += decode[7]*c; \ + tot1 += decode[8]*c; \ + tot2 += decode[9]*c; \ + tot3 += decode[10]*c; \ + tot4 += decode[11]*c; \ + tot5 += decode[12]*c; \ + tot6 += decode[13]*c; \ + c = hc[2]; \ + tot0 += decode[14]*c; \ + tot1 += decode[15]*c; \ + tot2 += decode[16]*c; \ + tot3 += decode[17]*c; \ + tot4 += decode[18]*c; \ + tot5 += decode[19]*c; \ + tot6 += decode[20]*c; \ + +#define stbir__store_output_tiny() \ + output[0] = tot0; \ + output[1] = tot1; \ + output[2] = tot2; \ + output[3] = tot3; \ + output[4] = tot4; \ + output[5] = tot5; \ + output[6] = tot6; \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 7; + +#define stbir__4_coeff_start() \ + float x0,x1,x2,x3,x4,x5,x6,y0,y1,y2,y3,y4,y5,y6,c; \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0]; \ + x0 = decode[0] * c; \ + x1 = decode[1] * c; \ + x2 = decode[2] * c; \ + x3 = decode[3] * c; \ + x4 = decode[4] * c; \ + x5 = decode[5] * c; \ + x6 = decode[6] * c; \ + c = hc[1]; \ + y0 = decode[7] * c; \ + y1 = decode[8] * c; \ + y2 = decode[9] * c; \ + y3 = decode[10] * c; \ + y4 = decode[11] * c; \ + y5 = decode[12] * c; \ + y6 = decode[13] * c; \ + c = hc[2]; \ + x0 += decode[14] * c; \ + x1 += decode[15] * c; \ + x2 += decode[16] * c; \ + x3 += decode[17] * c; \ + x4 += decode[18] * c; \ + x5 += decode[19] * c; \ + x6 += decode[20] * c; \ + c = hc[3]; \ + y0 += decode[21] * c; \ + y1 += decode[22] * c; \ + y2 += decode[23] * c; \ + y3 += decode[24] * c; \ + y4 += decode[25] * c; \ + y5 += decode[26] * c; \ + y6 += decode[27] * c; + +#define stbir__4_coeff_continue_from_4( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0+(ofs)]; \ + x0 += decode[0+(ofs)*7] * c; \ + x1 += decode[1+(ofs)*7] * c; \ + x2 += decode[2+(ofs)*7] * c; \ + x3 += decode[3+(ofs)*7] * c; \ + x4 += decode[4+(ofs)*7] * c; \ + x5 += decode[5+(ofs)*7] * c; \ + x6 += decode[6+(ofs)*7] * c; \ + c = hc[1+(ofs)]; \ + y0 += decode[7+(ofs)*7] * c; \ + y1 += decode[8+(ofs)*7] * c; \ + y2 += decode[9+(ofs)*7] * c; \ + y3 += decode[10+(ofs)*7] * c; \ + y4 += decode[11+(ofs)*7] * c; \ + y5 += decode[12+(ofs)*7] * c; \ + y6 += decode[13+(ofs)*7] * c; \ + c = hc[2+(ofs)]; \ + x0 += decode[14+(ofs)*7] * c; \ + x1 += decode[15+(ofs)*7] * c; \ + x2 += decode[16+(ofs)*7] * c; \ + x3 += decode[17+(ofs)*7] * c; \ + x4 += decode[18+(ofs)*7] * c; \ + x5 += decode[19+(ofs)*7] * c; \ + x6 += decode[20+(ofs)*7] * c; \ + c = hc[3+(ofs)]; \ + y0 += decode[21+(ofs)*7] * c; \ + y1 += decode[22+(ofs)*7] * c; \ + y2 += decode[23+(ofs)*7] * c; \ + y3 += decode[24+(ofs)*7] * c; \ + y4 += decode[25+(ofs)*7] * c; \ + y5 += decode[26+(ofs)*7] * c; \ + y6 += decode[27+(ofs)*7] * c; + +#define stbir__1_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0+(ofs)]; \ + x0 += decode[0+(ofs)*7] * c; \ + x1 += decode[1+(ofs)*7] * c; \ + x2 += decode[2+(ofs)*7] * c; \ + x3 += decode[3+(ofs)*7] * c; \ + x4 += decode[4+(ofs)*7] * c; \ + x5 += decode[5+(ofs)*7] * c; \ + x6 += decode[6+(ofs)*7] * c; \ + +#define stbir__2_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0+(ofs)]; \ + x0 += decode[0+(ofs)*7] * c; \ + x1 += decode[1+(ofs)*7] * c; \ + x2 += decode[2+(ofs)*7] * c; \ + x3 += decode[3+(ofs)*7] * c; \ + x4 += decode[4+(ofs)*7] * c; \ + x5 += decode[5+(ofs)*7] * c; \ + x6 += decode[6+(ofs)*7] * c; \ + c = hc[1+(ofs)]; \ + y0 += decode[7+(ofs)*7] * c; \ + y1 += decode[8+(ofs)*7] * c; \ + y2 += decode[9+(ofs)*7] * c; \ + y3 += decode[10+(ofs)*7] * c; \ + y4 += decode[11+(ofs)*7] * c; \ + y5 += decode[12+(ofs)*7] * c; \ + y6 += decode[13+(ofs)*7] * c; \ + +#define stbir__3_coeff_remnant( ofs ) \ + STBIR_SIMD_NO_UNROLL(decode); \ + c = hc[0+(ofs)]; \ + x0 += decode[0+(ofs)*7] * c; \ + x1 += decode[1+(ofs)*7] * c; \ + x2 += decode[2+(ofs)*7] * c; \ + x3 += decode[3+(ofs)*7] * c; \ + x4 += decode[4+(ofs)*7] * c; \ + x5 += decode[5+(ofs)*7] * c; \ + x6 += decode[6+(ofs)*7] * c; \ + c = hc[1+(ofs)]; \ + y0 += decode[7+(ofs)*7] * c; \ + y1 += decode[8+(ofs)*7] * c; \ + y2 += decode[9+(ofs)*7] * c; \ + y3 += decode[10+(ofs)*7] * c; \ + y4 += decode[11+(ofs)*7] * c; \ + y5 += decode[12+(ofs)*7] * c; \ + y6 += decode[13+(ofs)*7] * c; \ + c = hc[2+(ofs)]; \ + x0 += decode[14+(ofs)*7] * c; \ + x1 += decode[15+(ofs)*7] * c; \ + x2 += decode[16+(ofs)*7] * c; \ + x3 += decode[17+(ofs)*7] * c; \ + x4 += decode[18+(ofs)*7] * c; \ + x5 += decode[19+(ofs)*7] * c; \ + x6 += decode[20+(ofs)*7] * c; \ + +#define stbir__store_output() \ + output[0] = x0 + y0; \ + output[1] = x1 + y1; \ + output[2] = x2 + y2; \ + output[3] = x3 + y3; \ + output[4] = x4 + y4; \ + output[5] = x5 + y5; \ + output[6] = x6 + y6; \ + horizontal_coefficients += coefficient_width; \ + ++horizontal_contributors; \ + output += 7; + +#endif + +#define STBIR__horizontal_channels 7 +#define STB_IMAGE_RESIZE_DO_HORIZONTALS +#include STBIR__HEADER_FILENAME + + +// include all of the vertical resamplers (both scatter and gather versions) + +#define STBIR__vertical_channels 1 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 1 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 2 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 2 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 3 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 3 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 4 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 4 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 5 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 5 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 6 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 6 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 7 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 7 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 8 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#include STBIR__HEADER_FILENAME + +#define STBIR__vertical_channels 8 +#define STB_IMAGE_RESIZE_DO_VERTICALS +#define STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#include STBIR__HEADER_FILENAME + +typedef void STBIR_VERTICAL_GATHERFUNC( float * output, float const * coeffs, float const ** inputs, float const * input0_end ); + +static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers[ 8 ] = +{ + stbir__vertical_gather_with_1_coeffs,stbir__vertical_gather_with_2_coeffs,stbir__vertical_gather_with_3_coeffs,stbir__vertical_gather_with_4_coeffs,stbir__vertical_gather_with_5_coeffs,stbir__vertical_gather_with_6_coeffs,stbir__vertical_gather_with_7_coeffs,stbir__vertical_gather_with_8_coeffs +}; + +static STBIR_VERTICAL_GATHERFUNC * stbir__vertical_gathers_continues[ 8 ] = +{ + stbir__vertical_gather_with_1_coeffs_cont,stbir__vertical_gather_with_2_coeffs_cont,stbir__vertical_gather_with_3_coeffs_cont,stbir__vertical_gather_with_4_coeffs_cont,stbir__vertical_gather_with_5_coeffs_cont,stbir__vertical_gather_with_6_coeffs_cont,stbir__vertical_gather_with_7_coeffs_cont,stbir__vertical_gather_with_8_coeffs_cont +}; + +typedef void STBIR_VERTICAL_SCATTERFUNC( float ** outputs, float const * coeffs, float const * input, float const * input_end ); + +static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_sets[ 8 ] = +{ + stbir__vertical_scatter_with_1_coeffs,stbir__vertical_scatter_with_2_coeffs,stbir__vertical_scatter_with_3_coeffs,stbir__vertical_scatter_with_4_coeffs,stbir__vertical_scatter_with_5_coeffs,stbir__vertical_scatter_with_6_coeffs,stbir__vertical_scatter_with_7_coeffs,stbir__vertical_scatter_with_8_coeffs +}; + +static STBIR_VERTICAL_SCATTERFUNC * stbir__vertical_scatter_blends[ 8 ] = +{ + stbir__vertical_scatter_with_1_coeffs_cont,stbir__vertical_scatter_with_2_coeffs_cont,stbir__vertical_scatter_with_3_coeffs_cont,stbir__vertical_scatter_with_4_coeffs_cont,stbir__vertical_scatter_with_5_coeffs_cont,stbir__vertical_scatter_with_6_coeffs_cont,stbir__vertical_scatter_with_7_coeffs_cont,stbir__vertical_scatter_with_8_coeffs_cont +}; + + +static void stbir__encode_scanline( stbir__info const * stbir_info, void *output_buffer_data, float * encode_buffer, int row STBIR_ONLY_PROFILE_GET_SPLIT_INFO ) +{ + int num_pixels = stbir_info->horizontal.scale_info.output_sub_size; + int channels = stbir_info->channels; + int width_times_channels = num_pixels * channels; + void * output_buffer; + + // un-alpha weight if we need to + if ( stbir_info->alpha_unweight ) + { + STBIR_PROFILE_START( unalpha ); + stbir_info->alpha_unweight( encode_buffer, width_times_channels ); + STBIR_PROFILE_END( unalpha ); + } + + // write directly into output by default + output_buffer = output_buffer_data; + + // if we have an output callback, we first convert the decode buffer in place (and then hand that to the callback) + if ( stbir_info->out_pixels_cb ) + output_buffer = encode_buffer; + + STBIR_PROFILE_START( encode ); + // convert into the output buffer + stbir_info->encode_pixels( output_buffer, width_times_channels, encode_buffer ); + STBIR_PROFILE_END( encode ); + + // if we have an output callback, call it to send the data + if ( stbir_info->out_pixels_cb ) + stbir_info->out_pixels_cb( output_buffer, num_pixels, row, stbir_info->user_data ); +} + + +// Get the ring buffer pointer for an index +static float* stbir__get_ring_buffer_entry(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int index ) +{ + STBIR_ASSERT( index < stbir_info->ring_buffer_num_entries ); + + #ifdef STBIR__SEPARATE_ALLOCATIONS + return split_info->ring_buffers[ index ]; + #else + return (float*) ( ( (char*) split_info->ring_buffer ) + ( index * stbir_info->ring_buffer_length_bytes ) ); + #endif +} + +// Get the specified scan line from the ring buffer +static float* stbir__get_ring_buffer_scanline(stbir__info const * stbir_info, stbir__per_split_info const * split_info, int get_scanline) +{ + int ring_buffer_index = (split_info->ring_buffer_begin_index + (get_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries; + return stbir__get_ring_buffer_entry( stbir_info, split_info, ring_buffer_index ); +} + +static void stbir__resample_horizontal_gather(stbir__info const * stbir_info, float* output_buffer, float const * input_buffer STBIR_ONLY_PROFILE_GET_SPLIT_INFO ) +{ + float const * decode_buffer = input_buffer - ( stbir_info->scanline_extents.conservative.n0 * stbir_info->effective_channels ); + + STBIR_PROFILE_START( horizontal ); + if ( ( stbir_info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( stbir_info->horizontal.scale_info.scale == 1.0f ) ) + STBIR_MEMCPY( output_buffer, input_buffer, stbir_info->horizontal.scale_info.output_sub_size * sizeof( float ) * stbir_info->effective_channels ); + else + stbir_info->horizontal_gather_channels( output_buffer, stbir_info->horizontal.scale_info.output_sub_size, decode_buffer, stbir_info->horizontal.contributors, stbir_info->horizontal.coefficients, stbir_info->horizontal.coefficient_width ); + STBIR_PROFILE_END( horizontal ); +} + +static void stbir__resample_vertical_gather(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n, int contrib_n0, int contrib_n1, float const * vertical_coefficients ) +{ + float* encode_buffer = split_info->vertical_buffer; + float* decode_buffer = split_info->decode_buffer; + int vertical_first = stbir_info->vertical_first; + int width = (vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size; + int width_times_channels = stbir_info->effective_channels * width; + + STBIR_ASSERT( stbir_info->vertical.is_gather ); + + // loop over the contributing scanlines and scale into the buffer + STBIR_PROFILE_START( vertical ); + { + int k = 0, total = contrib_n1 - contrib_n0 + 1; + STBIR_ASSERT( total > 0 ); + do { + float const * inputs[8]; + int i, cnt = total; if ( cnt > 8 ) cnt = 8; + for( i = 0 ; i < cnt ; i++ ) + inputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+contrib_n0 ); + + // call the N scanlines at a time function (up to 8 scanlines of blending at once) + ((k==0)?stbir__vertical_gathers:stbir__vertical_gathers_continues)[cnt-1]( (vertical_first) ? decode_buffer : encode_buffer, vertical_coefficients + k, inputs, inputs[0] + width_times_channels ); + k += cnt; + total -= cnt; + } while ( total ); + } + STBIR_PROFILE_END( vertical ); + + if ( vertical_first ) + { + // Now resample the gathered vertical data in the horizontal axis into the encode buffer + decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3 + decode_buffer[ width_times_channels+1 ] = 0.0f; + stbir__resample_horizontal_gather(stbir_info, encode_buffer, decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + } + + stbir__encode_scanline( stbir_info, ( (char *) stbir_info->output_data ) + ((size_t)n * (size_t)stbir_info->output_stride_bytes), + encode_buffer, n STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); +} + +static void stbir__decode_and_resample_for_vertical_gather_loop(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n) +{ + int ring_buffer_index; + float* ring_buffer; + + // Decode the nth scanline from the source image into the decode buffer. + stbir__decode_scanline( stbir_info, n, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + + // update new end scanline + split_info->ring_buffer_last_scanline = n; + + // get ring buffer + ring_buffer_index = (split_info->ring_buffer_begin_index + (split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries; + ring_buffer = stbir__get_ring_buffer_entry(stbir_info, split_info, ring_buffer_index); + + // Now resample it into the ring buffer. + stbir__resample_horizontal_gather( stbir_info, ring_buffer, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + + // Now it's sitting in the ring buffer ready to be used as source for the vertical sampling. +} + +static void stbir__vertical_gather_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count ) +{ + int y, start_output_y, end_output_y; + stbir__contributors* vertical_contributors = stbir_info->vertical.contributors; + float const * vertical_coefficients = stbir_info->vertical.coefficients; + + STBIR_ASSERT( stbir_info->vertical.is_gather ); + + start_output_y = split_info->start_output_y; + end_output_y = split_info[split_count-1].end_output_y; + + vertical_contributors += start_output_y; + vertical_coefficients += start_output_y * stbir_info->vertical.coefficient_width; + + // initialize the ring buffer for gathering + split_info->ring_buffer_begin_index = 0; + split_info->ring_buffer_first_scanline = vertical_contributors->n0; + split_info->ring_buffer_last_scanline = split_info->ring_buffer_first_scanline - 1; // means "empty" + + for (y = start_output_y; y < end_output_y; y++) + { + int in_first_scanline, in_last_scanline; + + in_first_scanline = vertical_contributors->n0; + in_last_scanline = vertical_contributors->n1; + + // make sure the indexing hasn't broken + STBIR_ASSERT( in_first_scanline >= split_info->ring_buffer_first_scanline ); + + // Load in new scanlines + while (in_last_scanline > split_info->ring_buffer_last_scanline) + { + STBIR_ASSERT( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) <= stbir_info->ring_buffer_num_entries ); + + // make sure there was room in the ring buffer when we add new scanlines + if ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) + { + split_info->ring_buffer_first_scanline++; + split_info->ring_buffer_begin_index++; + } + + if ( stbir_info->vertical_first ) + { + float * ring_buffer = stbir__get_ring_buffer_scanline( stbir_info, split_info, ++split_info->ring_buffer_last_scanline ); + // Decode the nth scanline from the source image into the decode buffer. + stbir__decode_scanline( stbir_info, split_info->ring_buffer_last_scanline, ring_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + } + else + { + stbir__decode_and_resample_for_vertical_gather_loop(stbir_info, split_info, split_info->ring_buffer_last_scanline + 1); + } + } + + // Now all buffers should be ready to write a row of vertical sampling, so do it. + stbir__resample_vertical_gather(stbir_info, split_info, y, in_first_scanline, in_last_scanline, vertical_coefficients ); + + ++vertical_contributors; + vertical_coefficients += stbir_info->vertical.coefficient_width; + } +} + +#define STBIR__FLOAT_EMPTY_MARKER 3.0e+38F +#define STBIR__FLOAT_BUFFER_IS_EMPTY(ptr) ((ptr)[0]==STBIR__FLOAT_EMPTY_MARKER) + +static void stbir__encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info) +{ + // evict a scanline out into the output buffer + float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index ); + + // dump the scanline out + stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), ring_buffer_entry, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + + // mark it as empty + ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER; + + // advance the first scanline + split_info->ring_buffer_first_scanline++; + if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries ) + split_info->ring_buffer_begin_index = 0; +} + +static void stbir__horizontal_resample_and_encode_first_scanline_from_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info) +{ + // evict a scanline out into the output buffer + + float* ring_buffer_entry = stbir__get_ring_buffer_entry(stbir_info, split_info, split_info->ring_buffer_begin_index ); + + // Now resample it into the buffer. + stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, ring_buffer_entry STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + + // dump the scanline out + stbir__encode_scanline( stbir_info, ( (char *)stbir_info->output_data ) + ( (size_t)split_info->ring_buffer_first_scanline * (size_t)stbir_info->output_stride_bytes ), split_info->vertical_buffer, split_info->ring_buffer_first_scanline STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + + // mark it as empty + ring_buffer_entry[ 0 ] = STBIR__FLOAT_EMPTY_MARKER; + + // advance the first scanline + split_info->ring_buffer_first_scanline++; + if ( ++split_info->ring_buffer_begin_index == stbir_info->ring_buffer_num_entries ) + split_info->ring_buffer_begin_index = 0; +} + +static void stbir__resample_vertical_scatter(stbir__info const * stbir_info, stbir__per_split_info* split_info, int n0, int n1, float const * vertical_coefficients, float const * vertical_buffer, float const * vertical_buffer_end ) +{ + STBIR_ASSERT( !stbir_info->vertical.is_gather ); + + STBIR_PROFILE_START( vertical ); + { + int k = 0, total = n1 - n0 + 1; + STBIR_ASSERT( total > 0 ); + do { + float * outputs[8]; + int i, n = total; if ( n > 8 ) n = 8; + for( i = 0 ; i < n ; i++ ) + { + outputs[ i ] = stbir__get_ring_buffer_scanline(stbir_info, split_info, k+i+n0 ); + if ( ( i ) && ( STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[i] ) != STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ) ) ) // make sure runs are of the same type + { + n = i; + break; + } + } + // call the scatter to N scanlines at a time function (up to 8 scanlines of scattering at once) + ((STBIR__FLOAT_BUFFER_IS_EMPTY( outputs[0] ))?stbir__vertical_scatter_sets:stbir__vertical_scatter_blends)[n-1]( outputs, vertical_coefficients + k, vertical_buffer, vertical_buffer_end ); + k += n; + total -= n; + } while ( total ); + } + + STBIR_PROFILE_END( vertical ); +} + +typedef void stbir__handle_scanline_for_scatter_func(stbir__info const * stbir_info, stbir__per_split_info* split_info); + +static void stbir__vertical_scatter_loop( stbir__info const * stbir_info, stbir__per_split_info* split_info, int split_count ) +{ + int y, start_output_y, end_output_y, start_input_y, end_input_y; + stbir__contributors* vertical_contributors = stbir_info->vertical.contributors; + float const * vertical_coefficients = stbir_info->vertical.coefficients; + stbir__handle_scanline_for_scatter_func * handle_scanline_for_scatter; + void * scanline_scatter_buffer; + void * scanline_scatter_buffer_end; + int on_first_input_y, last_input_y; + int width = (stbir_info->vertical_first) ? ( stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1 ) : stbir_info->horizontal.scale_info.output_sub_size; + int width_times_channels = stbir_info->effective_channels * width; + + STBIR_ASSERT( !stbir_info->vertical.is_gather ); + + start_output_y = split_info->start_output_y; + end_output_y = split_info[split_count-1].end_output_y; // may do multiple split counts + + start_input_y = split_info->start_input_y; + end_input_y = split_info[split_count-1].end_input_y; + + // adjust for starting offset start_input_y + y = start_input_y + stbir_info->vertical.filter_pixel_margin; + vertical_contributors += y ; + vertical_coefficients += stbir_info->vertical.coefficient_width * y; + + if ( stbir_info->vertical_first ) + { + handle_scanline_for_scatter = stbir__horizontal_resample_and_encode_first_scanline_from_scatter; + scanline_scatter_buffer = split_info->decode_buffer; + scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * (stbir_info->scanline_extents.conservative.n1-stbir_info->scanline_extents.conservative.n0+1); + } + else + { + handle_scanline_for_scatter = stbir__encode_first_scanline_from_scatter; + scanline_scatter_buffer = split_info->vertical_buffer; + scanline_scatter_buffer_end = ( (char*) scanline_scatter_buffer ) + sizeof( float ) * stbir_info->effective_channels * stbir_info->horizontal.scale_info.output_sub_size; + } + + // initialize the ring buffer for scattering + split_info->ring_buffer_first_scanline = start_output_y; + split_info->ring_buffer_last_scanline = -1; + split_info->ring_buffer_begin_index = -1; + + // mark all the buffers as empty to start + for( y = 0 ; y < stbir_info->ring_buffer_num_entries ; y++ ) + { + float * decode_buffer = stbir__get_ring_buffer_entry( stbir_info, split_info, y ); + decode_buffer[ width_times_channels ] = 0.0f; // clear two over for horizontals with a remnant of 3 + decode_buffer[ width_times_channels+1 ] = 0.0f; + decode_buffer[0] = STBIR__FLOAT_EMPTY_MARKER; // only used on scatter + } + + // do the loop in input space + on_first_input_y = 1; last_input_y = start_input_y; + for (y = start_input_y ; y < end_input_y; y++) + { + int out_first_scanline, out_last_scanline; + + out_first_scanline = vertical_contributors->n0; + out_last_scanline = vertical_contributors->n1; + + STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries); + + if ( ( out_last_scanline >= out_first_scanline ) && ( ( ( out_first_scanline >= start_output_y ) && ( out_first_scanline < end_output_y ) ) || ( ( out_last_scanline >= start_output_y ) && ( out_last_scanline < end_output_y ) ) ) ) + { + float const * vc = vertical_coefficients; + + // keep track of the range actually seen for the next resize + last_input_y = y; + if ( ( on_first_input_y ) && ( y > start_input_y ) ) + split_info->start_input_y = y; + on_first_input_y = 0; + + // clip the region + if ( out_first_scanline < start_output_y ) + { + vc += start_output_y - out_first_scanline; + out_first_scanline = start_output_y; + } + + if ( out_last_scanline >= end_output_y ) + out_last_scanline = end_output_y - 1; + + // if very first scanline, init the index + if (split_info->ring_buffer_begin_index < 0) + split_info->ring_buffer_begin_index = out_first_scanline - start_output_y; + + STBIR_ASSERT( split_info->ring_buffer_begin_index <= out_first_scanline ); + + // Decode the nth scanline from the source image into the decode buffer. + stbir__decode_scanline( stbir_info, y, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + + // When horizontal first, we resample horizontally into the vertical buffer before we scatter it out + if ( !stbir_info->vertical_first ) + stbir__resample_horizontal_gather( stbir_info, split_info->vertical_buffer, split_info->decode_buffer STBIR_ONLY_PROFILE_SET_SPLIT_INFO ); + + // Now it's sitting in the buffer ready to be distributed into the ring buffers. + + // evict from the ringbuffer, if we need are full + if ( ( ( split_info->ring_buffer_last_scanline - split_info->ring_buffer_first_scanline + 1 ) == stbir_info->ring_buffer_num_entries ) && + ( out_last_scanline > split_info->ring_buffer_last_scanline ) ) + handle_scanline_for_scatter( stbir_info, split_info ); + + // Now the horizontal buffer is ready to write to all ring buffer rows, so do it. + stbir__resample_vertical_scatter(stbir_info, split_info, out_first_scanline, out_last_scanline, vc, (float*)scanline_scatter_buffer, (float*)scanline_scatter_buffer_end ); + + // update the end of the buffer + if ( out_last_scanline > split_info->ring_buffer_last_scanline ) + split_info->ring_buffer_last_scanline = out_last_scanline; + } + ++vertical_contributors; + vertical_coefficients += stbir_info->vertical.coefficient_width; + } + + // now evict the scanlines that are left over in the ring buffer + while ( split_info->ring_buffer_first_scanline < end_output_y ) + handle_scanline_for_scatter(stbir_info, split_info); + + // update the end_input_y if we do multiple resizes with the same data + ++last_input_y; + for( y = 0 ; y < split_count; y++ ) + if ( split_info[y].end_input_y > last_input_y ) + split_info[y].end_input_y = last_input_y; +} + + +static stbir__kernel_callback * stbir__builtin_kernels[] = { 0, stbir__filter_trapezoid, stbir__filter_triangle, stbir__filter_cubic, stbir__filter_catmullrom, stbir__filter_mitchell, stbir__filter_point }; +static stbir__support_callback * stbir__builtin_supports[] = { 0, stbir__support_trapezoid, stbir__support_one, stbir__support_two, stbir__support_two, stbir__support_two, stbir__support_zeropoint5 }; + +static void stbir__set_sampler(stbir__sampler * samp, stbir_filter filter, stbir__kernel_callback * kernel, stbir__support_callback * support, stbir_edge edge, stbir__scale_info * scale_info, int always_gather, void * user_data ) +{ + // set filter + if (filter == 0) + { + filter = STBIR_DEFAULT_FILTER_DOWNSAMPLE; // default to downsample + if (scale_info->scale >= ( 1.0f - stbir__small_float ) ) + { + if ( (scale_info->scale <= ( 1.0f + stbir__small_float ) ) && ( STBIR_CEILF(scale_info->pixel_shift) == scale_info->pixel_shift ) ) + filter = STBIR_FILTER_POINT_SAMPLE; + else + filter = STBIR_DEFAULT_FILTER_UPSAMPLE; + } + } + samp->filter_enum = filter; + + STBIR_ASSERT(samp->filter_enum != 0); + STBIR_ASSERT((unsigned)samp->filter_enum < STBIR_FILTER_OTHER); + samp->filter_kernel = stbir__builtin_kernels[ filter ]; + samp->filter_support = stbir__builtin_supports[ filter ]; + + if ( kernel && support ) + { + samp->filter_kernel = kernel; + samp->filter_support = support; + samp->filter_enum = STBIR_FILTER_OTHER; + } + + samp->edge = edge; + samp->filter_pixel_width = stbir__get_filter_pixel_width (samp->filter_support, scale_info->scale, user_data ); + // Gather is always better, but in extreme downsamples, you have to most or all of the data in memory + // For horizontal, we always have all the pixels, so we always use gather here (always_gather==1). + // For vertical, we use gather if scaling up (which means we will have samp->filter_pixel_width + // scanlines in memory at once). + samp->is_gather = 0; + if ( scale_info->scale >= ( 1.0f - stbir__small_float ) ) + samp->is_gather = 1; + else if ( ( always_gather ) || ( samp->filter_pixel_width <= STBIR_FORCE_GATHER_FILTER_SCANLINES_AMOUNT ) ) + samp->is_gather = 2; + + // pre calculate stuff based on the above + samp->coefficient_width = stbir__get_coefficient_width(samp, samp->is_gather, user_data); + + // filter_pixel_width is the conservative size in pixels of input that affect an output pixel. + // In rare cases (only with 2 pix to 1 pix with the default filters), it's possible that the + // filter will extend before or after the scanline beyond just one extra entire copy of the + // scanline (we would hit the edge twice). We don't let you do that, so we clamp the total + // width to 3x the total of input pixel (once for the scanline, once for the left side + // overhang, and once for the right side). We only do this for edge mode, since the other + // modes can just re-edge clamp back in again. + if ( edge == STBIR_EDGE_WRAP ) + if ( samp->filter_pixel_width > ( scale_info->input_full_size * 3 ) ) + samp->filter_pixel_width = scale_info->input_full_size * 3; + + // This is how much to expand buffers to account for filters seeking outside + // the image boundaries. + samp->filter_pixel_margin = samp->filter_pixel_width / 2; + + // filter_pixel_margin is the amount that this filter can overhang on just one side of either + // end of the scanline (left or the right). Since we only allow you to overhang 1 scanline's + // worth of pixels, we clamp this one side of overhang to the input scanline size. Again, + // this clamping only happens in rare cases with the default filters (2 pix to 1 pix). + if ( edge == STBIR_EDGE_WRAP ) + if ( samp->filter_pixel_margin > scale_info->input_full_size ) + samp->filter_pixel_margin = scale_info->input_full_size; + + samp->num_contributors = stbir__get_contributors(samp, samp->is_gather); + + samp->contributors_size = samp->num_contributors * sizeof(stbir__contributors); + samp->coefficients_size = samp->num_contributors * samp->coefficient_width * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra sizeof(float) is padding + + samp->gather_prescatter_contributors = 0; + samp->gather_prescatter_coefficients = 0; + if ( samp->is_gather == 0 ) + { + samp->gather_prescatter_coefficient_width = samp->filter_pixel_width; + samp->gather_prescatter_num_contributors = stbir__get_contributors(samp, 2); + samp->gather_prescatter_contributors_size = samp->gather_prescatter_num_contributors * sizeof(stbir__contributors); + samp->gather_prescatter_coefficients_size = samp->gather_prescatter_num_contributors * samp->gather_prescatter_coefficient_width * sizeof(float); + } +} + +static void stbir__get_conservative_extents( stbir__sampler * samp, stbir__contributors * range, void * user_data ) +{ + float scale = samp->scale_info.scale; + float out_shift = samp->scale_info.pixel_shift; + stbir__support_callback * support = samp->filter_support; + int input_full_size = samp->scale_info.input_full_size; + stbir_edge edge = samp->edge; + float inv_scale = samp->scale_info.inv_scale; + + STBIR_ASSERT( samp->is_gather != 0 ); + + if ( samp->is_gather == 1 ) + { + int in_first_pixel, in_last_pixel; + float out_filter_radius = support(inv_scale, user_data) * scale; + + stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0.5, out_filter_radius, inv_scale, out_shift, input_full_size, edge ); + range->n0 = in_first_pixel; + stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, ( (float)(samp->scale_info.output_sub_size-1) ) + 0.5f, out_filter_radius, inv_scale, out_shift, input_full_size, edge ); + range->n1 = in_last_pixel; + } + else if ( samp->is_gather == 2 ) // downsample gather, refine + { + float in_pixels_radius = support(scale, user_data) * inv_scale; + int filter_pixel_margin = samp->filter_pixel_margin; + int output_sub_size = samp->scale_info.output_sub_size; + int input_end; + int n; + int in_first_pixel, in_last_pixel; + + // get a conservative area of the input range + stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, 0, 0, inv_scale, out_shift, input_full_size, edge ); + range->n0 = in_first_pixel; + stbir__calculate_in_pixel_range( &in_first_pixel, &in_last_pixel, (float)output_sub_size, 0, inv_scale, out_shift, input_full_size, edge ); + range->n1 = in_last_pixel; + + // now go through the margin to the start of area to find bottom + n = range->n0 + 1; + input_end = -filter_pixel_margin; + while( n >= input_end ) + { + int out_first_pixel, out_last_pixel; + stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size ); + if ( out_first_pixel > out_last_pixel ) + break; + + if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) ) + range->n0 = n; + --n; + } + + // now go through the end of the area through the margin to find top + n = range->n1 - 1; + input_end = n + 1 + filter_pixel_margin; + while( n <= input_end ) + { + int out_first_pixel, out_last_pixel; + stbir__calculate_out_pixel_range( &out_first_pixel, &out_last_pixel, ((float)n)+0.5f, in_pixels_radius, scale, out_shift, output_sub_size ); + if ( out_first_pixel > out_last_pixel ) + break; + if ( ( out_first_pixel < output_sub_size ) || ( out_last_pixel >= 0 ) ) + range->n1 = n; + ++n; + } + } + + if ( samp->edge == STBIR_EDGE_WRAP ) + { + // if we are wrapping, and we are very close to the image size (so the edges might merge), just use the scanline up to the edge + if ( ( range->n0 > 0 ) && ( range->n1 >= input_full_size ) ) + { + int marg = range->n1 - input_full_size + 1; + if ( ( marg + STBIR__MERGE_RUNS_PIXEL_THRESHOLD ) >= range->n0 ) + range->n0 = 0; + } + if ( ( range->n0 < 0 ) && ( range->n1 < (input_full_size-1) ) ) + { + int marg = -range->n0; + if ( ( input_full_size - marg - STBIR__MERGE_RUNS_PIXEL_THRESHOLD - 1 ) <= range->n1 ) + range->n1 = input_full_size - 1; + } + } + else + { + // for non-edge-wrap modes, we never read over the edge, so clamp + if ( range->n0 < 0 ) + range->n0 = 0; + if ( range->n1 >= input_full_size ) + range->n1 = input_full_size - 1; + } +} + +static void stbir__get_split_info( stbir__per_split_info* split_info, int splits, int output_height, int vertical_pixel_margin, int input_full_height, int is_gather, stbir__contributors * contribs ) +{ + int i, cur; + int left = output_height; + + cur = 0; + for( i = 0 ; i < splits ; i++ ) + { + int each; + + split_info[i].start_output_y = cur; + each = left / ( splits - i ); + split_info[i].end_output_y = cur + each; + + // ok, when we are gathering, we need to make sure we are starting on a y offset that doesn't have + // a "special" set of coefficients. Basically, with exactly the right filter at exactly the right + // resize at exactly the right phase, some of the coefficents can be zero. When they are zero, we + // don't process them at all. But this leads to a tricky thing with the thread splits, where we + // might have a set of two coeffs like this for example: (4,4) and (3,6). The 4,4 means there was + // just one single coeff because things worked out perfectly (normally, they all have 4 coeffs + // like the range 3,6. The problem is that if we start right on the (4,4) on a brand new thread, + // then when we get to (3,6), we don't have the "3" sample in memory (because we didn't load + // it on the initial (4,4) range because it didn't have a 3 (we only add new samples that are + // larger than our existing samples - it's just how the eviction works). So, our solution here + // is pretty simple, if we start right on a range that has samples that start earlier, then we + // simply bump up our previous thread split range to include it, and then start this threads + // range with the smaller sample. It just moves one scanline from one thread split to another, + // so that we end with the unusual one, instead of start with it. To do this, we check 2-4 + // sample at each thread split start and then occassionally move them. + + if ( ( is_gather ) && ( i ) ) + { + stbir__contributors * small_contribs; + int j, smallest, stop, start_n0; + stbir__contributors * split_contribs = contribs + cur; + + // scan for a max of 3x the filter width or until the next thread split + stop = vertical_pixel_margin * 3; + if ( each < stop ) + stop = each; + + // loops a few times before early out + smallest = 0; + small_contribs = split_contribs; + start_n0 = small_contribs->n0; + for( j = 1 ; j <= stop ; j++ ) + { + ++split_contribs; + if ( split_contribs->n0 > start_n0 ) + break; + if ( split_contribs->n0 < small_contribs->n0 ) + { + small_contribs = split_contribs; + smallest = j; + } + } + + split_info[i-1].end_output_y += smallest; + split_info[i].start_output_y += smallest; + } + + cur += each; + left -= each; + + // scatter range (updated to minimum as you run it) + split_info[i].start_input_y = -vertical_pixel_margin; + split_info[i].end_input_y = input_full_height + vertical_pixel_margin; + } +} + +static void stbir__free_internal_mem( stbir__info *info ) +{ + #define STBIR__FREE_AND_CLEAR( ptr ) { if ( ptr ) { void * p = (ptr); (ptr) = 0; STBIR_FREE( p, info->user_data); } } + + if ( info ) + { + #ifndef STBIR__SEPARATE_ALLOCATIONS + STBIR__FREE_AND_CLEAR( info->alloced_mem ); + #else + int i,j; + + if ( ( info->vertical.gather_prescatter_contributors ) && ( (void*)info->vertical.gather_prescatter_contributors != (void*)info->split_info[0].decode_buffer ) ) + { + STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_coefficients ); + STBIR__FREE_AND_CLEAR( info->vertical.gather_prescatter_contributors ); + } + for( i = 0 ; i < info->splits ; i++ ) + { + for( j = 0 ; j < info->alloc_ring_buffer_num_entries ; j++ ) + { + #ifdef STBIR_SIMD8 + if ( info->effective_channels == 3 ) + --info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer + #endif + STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers[j] ); + } + + #ifdef STBIR_SIMD8 + if ( info->effective_channels == 3 ) + --info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer + #endif + STBIR__FREE_AND_CLEAR( info->split_info[i].decode_buffer ); + STBIR__FREE_AND_CLEAR( info->split_info[i].ring_buffers ); + STBIR__FREE_AND_CLEAR( info->split_info[i].vertical_buffer ); + } + STBIR__FREE_AND_CLEAR( info->split_info ); + if ( info->vertical.coefficients != info->horizontal.coefficients ) + { + STBIR__FREE_AND_CLEAR( info->vertical.coefficients ); + STBIR__FREE_AND_CLEAR( info->vertical.contributors ); + } + STBIR__FREE_AND_CLEAR( info->horizontal.coefficients ); + STBIR__FREE_AND_CLEAR( info->horizontal.contributors ); + STBIR__FREE_AND_CLEAR( info->alloced_mem ); + STBIR_FREE( info, info->user_data ); + #endif + } + + #undef STBIR__FREE_AND_CLEAR +} + +static int stbir__get_max_split( int splits, int height ) +{ + int i; + int max = 0; + + for( i = 0 ; i < splits ; i++ ) + { + int each = height / ( splits - i ); + if ( each > max ) + max = each; + height -= each; + } + return max; +} + +static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_n_coeffs_funcs[8] = +{ + 0, stbir__horizontal_gather_1_channels_with_n_coeffs_funcs, stbir__horizontal_gather_2_channels_with_n_coeffs_funcs, stbir__horizontal_gather_3_channels_with_n_coeffs_funcs, stbir__horizontal_gather_4_channels_with_n_coeffs_funcs, 0,0, stbir__horizontal_gather_7_channels_with_n_coeffs_funcs +}; + +static stbir__horizontal_gather_channels_func ** stbir__horizontal_gather_channels_funcs[8] = +{ + 0, stbir__horizontal_gather_1_channels_funcs, stbir__horizontal_gather_2_channels_funcs, stbir__horizontal_gather_3_channels_funcs, stbir__horizontal_gather_4_channels_funcs, 0,0, stbir__horizontal_gather_7_channels_funcs +}; + +// there are six resize classifications: 0 == vertical scatter, 1 == vertical gather < 1x scale, 2 == vertical gather 1x-2x scale, 4 == vertical gather < 3x scale, 4 == vertical gather > 3x scale, 5 == <=4 pixel height, 6 == <=4 pixel wide column +#define STBIR_RESIZE_CLASSIFICATIONS 8 + +static float stbir__compute_weights[5][STBIR_RESIZE_CLASSIFICATIONS][4]= // 5 = 0=1chan, 1=2chan, 2=3chan, 3=4chan, 4=7chan +{ + { + { 1.00000f, 1.00000f, 0.31250f, 1.00000f }, + { 0.56250f, 0.59375f, 0.00000f, 0.96875f }, + { 1.00000f, 0.06250f, 0.00000f, 1.00000f }, + { 0.00000f, 0.09375f, 1.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 0.03125f, 0.12500f, 1.00000f, 1.00000f }, + { 0.06250f, 0.12500f, 0.00000f, 1.00000f }, + { 0.00000f, 1.00000f, 0.00000f, 0.03125f }, + }, { + { 0.00000f, 0.84375f, 0.00000f, 0.03125f }, + { 0.09375f, 0.93750f, 0.00000f, 0.78125f }, + { 0.87500f, 0.21875f, 0.00000f, 0.96875f }, + { 0.09375f, 0.09375f, 1.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 0.03125f, 0.12500f, 1.00000f, 1.00000f }, + { 0.06250f, 0.12500f, 0.00000f, 1.00000f }, + { 0.00000f, 1.00000f, 0.00000f, 0.53125f }, + }, { + { 0.00000f, 0.53125f, 0.00000f, 0.03125f }, + { 0.06250f, 0.96875f, 0.00000f, 0.53125f }, + { 0.87500f, 0.18750f, 0.00000f, 0.93750f }, + { 0.00000f, 0.09375f, 1.00000f, 1.00000f }, + { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 0.03125f, 0.12500f, 1.00000f, 1.00000f }, + { 0.06250f, 0.12500f, 0.00000f, 1.00000f }, + { 0.00000f, 1.00000f, 0.00000f, 0.56250f }, + }, { + { 0.00000f, 0.50000f, 0.00000f, 0.71875f }, + { 0.06250f, 0.84375f, 0.00000f, 0.87500f }, + { 1.00000f, 0.50000f, 0.50000f, 0.96875f }, + { 1.00000f, 0.09375f, 0.31250f, 0.50000f }, + { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 1.00000f, 0.03125f, 0.03125f, 0.53125f }, + { 0.18750f, 0.12500f, 0.00000f, 1.00000f }, + { 0.00000f, 1.00000f, 0.03125f, 0.18750f }, + }, { + { 0.00000f, 0.59375f, 0.00000f, 0.96875f }, + { 0.06250f, 0.81250f, 0.06250f, 0.59375f }, + { 0.75000f, 0.43750f, 0.12500f, 0.96875f }, + { 0.87500f, 0.06250f, 0.18750f, 0.43750f }, + { 1.00000f, 1.00000f, 1.00000f, 1.00000f }, + { 0.15625f, 0.12500f, 1.00000f, 1.00000f }, + { 0.06250f, 0.12500f, 0.00000f, 1.00000f }, + { 0.00000f, 1.00000f, 0.03125f, 0.34375f }, + } +}; + +// structure that allow us to query and override info for training the costs +typedef struct STBIR__V_FIRST_INFO +{ + double v_cost, h_cost; + int control_v_first; // 0 = no control, 1 = force hori, 2 = force vert + int v_first; + int v_resize_classification; + int is_gather; +} STBIR__V_FIRST_INFO; + +#ifdef STBIR__V_FIRST_INFO_BUFFER +static STBIR__V_FIRST_INFO STBIR__V_FIRST_INFO_BUFFER = {0}; +#define STBIR__V_FIRST_INFO_POINTER &STBIR__V_FIRST_INFO_BUFFER +#else +#define STBIR__V_FIRST_INFO_POINTER 0 +#endif + +// Figure out whether to scale along the horizontal or vertical first. +// This only *super* important when you are scaling by a massively +// different amount in the vertical vs the horizontal (for example, if +// you are scaling by 2x in the width, and 0.5x in the height, then you +// want to do the vertical scale first, because it's around 3x faster +// in that order. +// +// In more normal circumstances, this makes a 20-40% differences, so +// it's good to get right, but not critical. The normal way that you +// decide which direction goes first is just figuring out which +// direction does more multiplies. But with modern CPUs with their +// fancy caches and SIMD and high IPC abilities, so there's just a lot +// more that goes into it. +// +// My handwavy sort of solution is to have an app that does a whole +// bunch of timing for both vertical and horizontal first modes, +// and then another app that can read lots of these timing files +// and try to search for the best weights to use. Dotimings.c +// is the app that does a bunch of timings, and vf_train.c is the +// app that solves for the best weights (and shows how well it +// does currently). + +static int stbir__should_do_vertical_first( float weights_table[STBIR_RESIZE_CLASSIFICATIONS][4], int horizontal_filter_pixel_width, float horizontal_scale, int horizontal_output_size, int vertical_filter_pixel_width, float vertical_scale, int vertical_output_size, int is_gather, STBIR__V_FIRST_INFO * info ) +{ + double v_cost, h_cost; + float * weights; + int vertical_first; + int v_classification; + + // categorize the resize into buckets + if ( ( vertical_output_size <= 4 ) || ( horizontal_output_size <= 4 ) ) + v_classification = ( vertical_output_size < horizontal_output_size ) ? 6 : 7; + else if ( vertical_scale <= 1.0f ) + v_classification = ( is_gather ) ? 1 : 0; + else if ( vertical_scale <= 2.0f) + v_classification = 2; + else if ( vertical_scale <= 3.0f) + v_classification = 3; + else if ( vertical_scale <= 4.0f) + v_classification = 5; + else + v_classification = 6; + + // use the right weights + weights = weights_table[ v_classification ]; + + // this is the costs when you don't take into account modern CPUs with high ipc and simd and caches - wish we had a better estimate + h_cost = (float)horizontal_filter_pixel_width * weights[0] + horizontal_scale * (float)vertical_filter_pixel_width * weights[1]; + v_cost = (float)vertical_filter_pixel_width * weights[2] + vertical_scale * (float)horizontal_filter_pixel_width * weights[3]; + + // use computation estimate to decide vertical first or not + vertical_first = ( v_cost <= h_cost ) ? 1 : 0; + + // save these, if requested + if ( info ) + { + info->h_cost = h_cost; + info->v_cost = v_cost; + info->v_resize_classification = v_classification; + info->v_first = vertical_first; + info->is_gather = is_gather; + } + + // and this allows us to override everything for testing (see dotiming.c) + if ( ( info ) && ( info->control_v_first ) ) + vertical_first = ( info->control_v_first == 2 ) ? 1 : 0; + + return vertical_first; +} + +// layout lookups - must match stbir_internal_pixel_layout +static unsigned char stbir__pixel_channels[] = { + 1,2,3,3,4, // 1ch, 2ch, rgb, bgr, 4ch + 4,4,4,4,2,2, // RGBA,BGRA,ARGB,ABGR,RA,AR + 4,4,4,4,2,2, // RGBA_PM,BGRA_PM,ARGB_PM,ABGR_PM,RA_PM,AR_PM +}; + +// the internal pixel layout enums are in a different order, so we can easily do range comparisons of types +// the public pixel layout is ordered in a way that if you cast num_channels (1-4) to the enum, you get something sensible +static stbir_internal_pixel_layout stbir__pixel_layout_convert_public_to_internal[] = { + STBIRI_BGR, STBIRI_1CHANNEL, STBIRI_2CHANNEL, STBIRI_RGB, STBIRI_RGBA, + STBIRI_4CHANNEL, STBIRI_BGRA, STBIRI_ARGB, STBIRI_ABGR, STBIRI_RA, STBIRI_AR, + STBIRI_RGBA_PM, STBIRI_BGRA_PM, STBIRI_ARGB_PM, STBIRI_ABGR_PM, STBIRI_RA_PM, STBIRI_AR_PM, +}; + +static stbir__info * stbir__alloc_internal_mem_and_build_samplers( stbir__sampler * horizontal, stbir__sampler * vertical, stbir__contributors * conservative, stbir_pixel_layout input_pixel_layout_public, stbir_pixel_layout output_pixel_layout_public, int splits, int new_x, int new_y, int fast_alpha, void * user_data STBIR_ONLY_PROFILE_BUILD_GET_INFO ) +{ + static char stbir_channel_count_index[8]={ 9,0,1,2, 3,9,9,4 }; + + stbir__info * info = 0; + void * alloced = 0; + size_t alloced_total = 0; + int vertical_first; + size_t decode_buffer_size, ring_buffer_length_bytes, ring_buffer_size, vertical_buffer_size; + int alloc_ring_buffer_num_entries; + + int alpha_weighting_type = 0; // 0=none, 1=simple, 2=fancy + int conservative_split_output_size = stbir__get_max_split( splits, vertical->scale_info.output_sub_size ); + stbir_internal_pixel_layout input_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ input_pixel_layout_public ]; + stbir_internal_pixel_layout output_pixel_layout = stbir__pixel_layout_convert_public_to_internal[ output_pixel_layout_public ]; + int channels = stbir__pixel_channels[ input_pixel_layout ]; + int effective_channels = channels; + + // first figure out what type of alpha weighting to use (if any) + if ( ( horizontal->filter_enum != STBIR_FILTER_POINT_SAMPLE ) || ( vertical->filter_enum != STBIR_FILTER_POINT_SAMPLE ) ) // no alpha weighting on point sampling + { + if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) ) + { + if ( fast_alpha ) + { + alpha_weighting_type = 4; + } + else + { + static int fancy_alpha_effective_cnts[6] = { 7, 7, 7, 7, 3, 3 }; + alpha_weighting_type = 2; + effective_channels = fancy_alpha_effective_cnts[ input_pixel_layout - STBIRI_RGBA ]; + } + } + else if ( ( input_pixel_layout >= STBIRI_RGBA_PM ) && ( input_pixel_layout <= STBIRI_AR_PM ) && ( output_pixel_layout >= STBIRI_RGBA ) && ( output_pixel_layout <= STBIRI_AR ) ) + { + // input premult, output non-premult + alpha_weighting_type = 3; + } + else if ( ( input_pixel_layout >= STBIRI_RGBA ) && ( input_pixel_layout <= STBIRI_AR ) && ( output_pixel_layout >= STBIRI_RGBA_PM ) && ( output_pixel_layout <= STBIRI_AR_PM ) ) + { + // input non-premult, output premult + alpha_weighting_type = 1; + } + } + + // channel in and out count must match currently + if ( channels != stbir__pixel_channels[ output_pixel_layout ] ) + return 0; + + // get vertical first + vertical_first = stbir__should_do_vertical_first( stbir__compute_weights[ (int)stbir_channel_count_index[ effective_channels ] ], horizontal->filter_pixel_width, horizontal->scale_info.scale, horizontal->scale_info.output_sub_size, vertical->filter_pixel_width, vertical->scale_info.scale, vertical->scale_info.output_sub_size, vertical->is_gather, STBIR__V_FIRST_INFO_POINTER ); + + // sometimes read one float off in some of the unrolled loops (with a weight of zero coeff, so it doesn't have an effect) + // we use a few extra floats instead of just 1, so that input callback buffer can overlap with the decode buffer without + // the conversion routines overwriting the callback input data. + decode_buffer_size = ( conservative->n1 - conservative->n0 + 1 ) * effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for input callback stagger + +#if defined( STBIR__SEPARATE_ALLOCATIONS ) && defined(STBIR_SIMD8) + if ( effective_channels == 3 ) + decode_buffer_size += sizeof(float); // avx in 3 channel mode needs one float at the start of the buffer (only with separate allocations) +#endif + + ring_buffer_length_bytes = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float)*STBIR_INPUT_CALLBACK_PADDING; // extra floats for padding + + // if we do vertical first, the ring buffer holds a whole decoded line + if ( vertical_first ) + ring_buffer_length_bytes = ( decode_buffer_size + 15 ) & ~15; + + if ( ( ring_buffer_length_bytes & 4095 ) == 0 ) ring_buffer_length_bytes += 64*3; // avoid 4k alias + + // One extra entry because floating point precision problems sometimes cause an extra to be necessary. + alloc_ring_buffer_num_entries = vertical->filter_pixel_width + 1; + + // we never need more ring buffer entries than the scanlines we're outputting when in scatter mode + if ( ( !vertical->is_gather ) && ( alloc_ring_buffer_num_entries > conservative_split_output_size ) ) + alloc_ring_buffer_num_entries = conservative_split_output_size; + + ring_buffer_size = (size_t)alloc_ring_buffer_num_entries * (size_t)ring_buffer_length_bytes; + + // The vertical buffer is used differently, depending on whether we are scattering + // the vertical scanlines, or gathering them. + // If scattering, it's used at the temp buffer to accumulate each output. + // If gathering, it's just the output buffer. + vertical_buffer_size = (size_t)horizontal->scale_info.output_sub_size * (size_t)effective_channels * sizeof(float) + sizeof(float); // extra float for padding + + // we make two passes through this loop, 1st to add everything up, 2nd to allocate and init + for(;;) + { + int i; + void * advance_mem = alloced; + int copy_horizontal = 0; + stbir__sampler * possibly_use_horizontal_for_pivot = 0; + +#ifdef STBIR__SEPARATE_ALLOCATIONS + #define STBIR__NEXT_PTR( ptr, size, ntype ) if ( alloced ) { void * p = STBIR_MALLOC( size, user_data); if ( p == 0 ) { stbir__free_internal_mem( info ); return 0; } (ptr) = (ntype*)p; } +#else + #define STBIR__NEXT_PTR( ptr, size, ntype ) advance_mem = (void*) ( ( ((size_t)advance_mem) + 15 ) & ~15 ); if ( alloced ) ptr = (ntype*)advance_mem; advance_mem = (char*)(((size_t)advance_mem) + (size)); +#endif + + STBIR__NEXT_PTR( info, sizeof( stbir__info ), stbir__info ); + + STBIR__NEXT_PTR( info->split_info, sizeof( stbir__per_split_info ) * splits, stbir__per_split_info ); + + if ( info ) + { + static stbir__alpha_weight_func * fancy_alpha_weights[6] = { stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_4ch, stbir__fancy_alpha_weight_2ch, stbir__fancy_alpha_weight_2ch }; + static stbir__alpha_unweight_func * fancy_alpha_unweights[6] = { stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_4ch, stbir__fancy_alpha_unweight_2ch, stbir__fancy_alpha_unweight_2ch }; + static stbir__alpha_weight_func * simple_alpha_weights[6] = { stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_4ch, stbir__simple_alpha_weight_2ch, stbir__simple_alpha_weight_2ch }; + static stbir__alpha_unweight_func * simple_alpha_unweights[6] = { stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_4ch, stbir__simple_alpha_unweight_2ch, stbir__simple_alpha_unweight_2ch }; + + // initialize info fields + info->alloced_mem = alloced; + info->alloced_total = alloced_total; + + info->channels = channels; + info->effective_channels = effective_channels; + + info->offset_x = new_x; + info->offset_y = new_y; + info->alloc_ring_buffer_num_entries = (int)alloc_ring_buffer_num_entries; + info->ring_buffer_num_entries = 0; + info->ring_buffer_length_bytes = (int)ring_buffer_length_bytes; + info->splits = splits; + info->vertical_first = vertical_first; + + info->input_pixel_layout_internal = input_pixel_layout; + info->output_pixel_layout_internal = output_pixel_layout; + + // setup alpha weight functions + info->alpha_weight = 0; + info->alpha_unweight = 0; + + // handle alpha weighting functions and overrides + if ( alpha_weighting_type == 2 ) + { + // high quality alpha multiplying on the way in, dividing on the way out + info->alpha_weight = fancy_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; + info->alpha_unweight = fancy_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ]; + } + else if ( alpha_weighting_type == 4 ) + { + // fast alpha multiplying on the way in, dividing on the way out + info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; + info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ]; + } + else if ( alpha_weighting_type == 1 ) + { + // fast alpha on the way in, leave in premultiplied form on way out + info->alpha_weight = simple_alpha_weights[ input_pixel_layout - STBIRI_RGBA ]; + } + else if ( alpha_weighting_type == 3 ) + { + // incoming is premultiplied, fast alpha dividing on the way out - non-premultiplied output + info->alpha_unweight = simple_alpha_unweights[ output_pixel_layout - STBIRI_RGBA ]; + } + + // handle 3-chan color flipping, using the alpha weight path + if ( ( ( input_pixel_layout == STBIRI_RGB ) && ( output_pixel_layout == STBIRI_BGR ) ) || + ( ( input_pixel_layout == STBIRI_BGR ) && ( output_pixel_layout == STBIRI_RGB ) ) ) + { + // do the flipping on the smaller of the two ends + if ( horizontal->scale_info.scale < 1.0f ) + info->alpha_unweight = stbir__simple_flip_3ch; + else + info->alpha_weight = stbir__simple_flip_3ch; + } + + } + + // get all the per-split buffers + for( i = 0 ; i < splits ; i++ ) + { + STBIR__NEXT_PTR( info->split_info[i].decode_buffer, decode_buffer_size, float ); + +#ifdef STBIR__SEPARATE_ALLOCATIONS + + #ifdef STBIR_SIMD8 + if ( ( info ) && ( effective_channels == 3 ) ) + ++info->split_info[i].decode_buffer; // avx in 3 channel mode needs one float at the start of the buffer + #endif + + STBIR__NEXT_PTR( info->split_info[i].ring_buffers, alloc_ring_buffer_num_entries * sizeof(float*), float* ); + { + int j; + for( j = 0 ; j < alloc_ring_buffer_num_entries ; j++ ) + { + STBIR__NEXT_PTR( info->split_info[i].ring_buffers[j], ring_buffer_length_bytes, float ); + #ifdef STBIR_SIMD8 + if ( ( info ) && ( effective_channels == 3 ) ) + ++info->split_info[i].ring_buffers[j]; // avx in 3 channel mode needs one float at the start of the buffer + #endif + } + } +#else + STBIR__NEXT_PTR( info->split_info[i].ring_buffer, ring_buffer_size, float ); +#endif + STBIR__NEXT_PTR( info->split_info[i].vertical_buffer, vertical_buffer_size, float ); + } + + // alloc memory for to-be-pivoted coeffs (if necessary) + if ( vertical->is_gather == 0 ) + { + size_t both; + size_t temp_mem_amt; + + // when in vertical scatter mode, we first build the coefficients in gather mode, and then pivot after, + // that means we need two buffers, so we try to use the decode buffer and ring buffer for this. if that + // is too small, we just allocate extra memory to use as this temp. + + both = (size_t)vertical->gather_prescatter_contributors_size + (size_t)vertical->gather_prescatter_coefficients_size; + +#ifdef STBIR__SEPARATE_ALLOCATIONS + temp_mem_amt = decode_buffer_size; + + #ifdef STBIR_SIMD8 + if ( effective_channels == 3 ) + --temp_mem_amt; // avx in 3 channel mode needs one float at the start of the buffer + #endif +#else + temp_mem_amt = (size_t)( decode_buffer_size + ring_buffer_size + vertical_buffer_size ) * (size_t)splits; +#endif + if ( temp_mem_amt >= both ) + { + if ( info ) + { + vertical->gather_prescatter_contributors = (stbir__contributors*)info->split_info[0].decode_buffer; + vertical->gather_prescatter_coefficients = (float*) ( ( (char*)info->split_info[0].decode_buffer ) + vertical->gather_prescatter_contributors_size ); + } + } + else + { + // ring+decode memory is too small, so allocate temp memory + STBIR__NEXT_PTR( vertical->gather_prescatter_contributors, vertical->gather_prescatter_contributors_size, stbir__contributors ); + STBIR__NEXT_PTR( vertical->gather_prescatter_coefficients, vertical->gather_prescatter_coefficients_size, float ); + } + } + + STBIR__NEXT_PTR( horizontal->contributors, horizontal->contributors_size, stbir__contributors ); + STBIR__NEXT_PTR( horizontal->coefficients, horizontal->coefficients_size, float ); + + // are the two filters identical?? (happens a lot with mipmap generation) + if ( ( horizontal->filter_kernel == vertical->filter_kernel ) && ( horizontal->filter_support == vertical->filter_support ) && ( horizontal->edge == vertical->edge ) && ( horizontal->scale_info.output_sub_size == vertical->scale_info.output_sub_size ) ) + { + float diff_scale = horizontal->scale_info.scale - vertical->scale_info.scale; + float diff_shift = horizontal->scale_info.pixel_shift - vertical->scale_info.pixel_shift; + if ( diff_scale < 0.0f ) diff_scale = -diff_scale; + if ( diff_shift < 0.0f ) diff_shift = -diff_shift; + if ( ( diff_scale <= stbir__small_float ) && ( diff_shift <= stbir__small_float ) ) + { + if ( horizontal->is_gather == vertical->is_gather ) + { + copy_horizontal = 1; + goto no_vert_alloc; + } + // everything matches, but vertical is scatter, horizontal is gather, use horizontal coeffs for vertical pivot coeffs + possibly_use_horizontal_for_pivot = horizontal; + } + } + + STBIR__NEXT_PTR( vertical->contributors, vertical->contributors_size, stbir__contributors ); + STBIR__NEXT_PTR( vertical->coefficients, vertical->coefficients_size, float ); + + no_vert_alloc: + + if ( info ) + { + STBIR_PROFILE_BUILD_START( horizontal ); + + stbir__calculate_filters( horizontal, 0, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO ); + + // setup the horizontal gather functions + // start with defaulting to the n_coeffs functions (specialized on channels and remnant leftover) + info->horizontal_gather_channels = stbir__horizontal_gather_n_coeffs_funcs[ effective_channels ][ horizontal->extent_info.widest & 3 ]; + // but if the number of coeffs <= 12, use another set of special cases. <=12 coeffs is any enlarging resize, or shrinking resize down to about 1/3 size + if ( horizontal->extent_info.widest <= 12 ) + info->horizontal_gather_channels = stbir__horizontal_gather_channels_funcs[ effective_channels ][ horizontal->extent_info.widest - 1 ]; + + info->scanline_extents.conservative.n0 = conservative->n0; + info->scanline_extents.conservative.n1 = conservative->n1; + + // get exact extents + stbir__get_extents( horizontal, &info->scanline_extents ); + + // pack the horizontal coeffs + horizontal->coefficient_width = stbir__pack_coefficients(horizontal->num_contributors, horizontal->contributors, horizontal->coefficients, horizontal->coefficient_width, horizontal->extent_info.widest, info->scanline_extents.conservative.n0, info->scanline_extents.conservative.n1 ); + + STBIR_MEMCPY( &info->horizontal, horizontal, sizeof( stbir__sampler ) ); + + STBIR_PROFILE_BUILD_END( horizontal ); + + if ( copy_horizontal ) + { + STBIR_MEMCPY( &info->vertical, horizontal, sizeof( stbir__sampler ) ); + } + else + { + STBIR_PROFILE_BUILD_START( vertical ); + + stbir__calculate_filters( vertical, possibly_use_horizontal_for_pivot, user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO ); + STBIR_MEMCPY( &info->vertical, vertical, sizeof( stbir__sampler ) ); + + STBIR_PROFILE_BUILD_END( vertical ); + } + + // setup the vertical split ranges + stbir__get_split_info( info->split_info, info->splits, info->vertical.scale_info.output_sub_size, info->vertical.filter_pixel_margin, info->vertical.scale_info.input_full_size, info->vertical.is_gather, info->vertical.contributors ); + + // now we know precisely how many entries we need + info->ring_buffer_num_entries = info->vertical.extent_info.widest; + + // we never need more ring buffer entries than the scanlines we're outputting + if ( ( !info->vertical.is_gather ) && ( info->ring_buffer_num_entries > conservative_split_output_size ) ) + info->ring_buffer_num_entries = conservative_split_output_size; + STBIR_ASSERT( info->ring_buffer_num_entries <= info->alloc_ring_buffer_num_entries ); + } + #undef STBIR__NEXT_PTR + + + // is this the first time through loop? + if ( info == 0 ) + { + alloced_total = ( 15 + (size_t)advance_mem ); + alloced = STBIR_MALLOC( alloced_total, user_data ); + if ( alloced == 0 ) + return 0; + } + else + return info; // success + } +} + +static int stbir__perform_resize( stbir__info const * info, int split_start, int split_count ) +{ + stbir__per_split_info * split_info = info->split_info + split_start; + + STBIR_PROFILE_CLEAR_EXTRAS(); + + STBIR_PROFILE_FIRST_START( looping ); + if (info->vertical.is_gather) + stbir__vertical_gather_loop( info, split_info, split_count ); + else + stbir__vertical_scatter_loop( info, split_info, split_count ); + STBIR_PROFILE_END( looping ); + + return 1; +} + +static void stbir__update_info_from_resize( stbir__info * info, STBIR_RESIZE * resize ) +{ + static stbir__decode_pixels_func * decode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]= + { + /* 1ch-4ch */ stbir__decode_uint8_srgb, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear, + }; + + static stbir__decode_pixels_func * decode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]= + { + { /* RGBA */ stbir__decode_uint8_srgb4_linearalpha, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear }, + { /* BGRA */ stbir__decode_uint8_srgb4_linearalpha_BGRA, stbir__decode_uint8_srgb_BGRA, 0, stbir__decode_float_linear_BGRA, stbir__decode_half_float_linear_BGRA }, + { /* ARGB */ stbir__decode_uint8_srgb4_linearalpha_ARGB, stbir__decode_uint8_srgb_ARGB, 0, stbir__decode_float_linear_ARGB, stbir__decode_half_float_linear_ARGB }, + { /* ABGR */ stbir__decode_uint8_srgb4_linearalpha_ABGR, stbir__decode_uint8_srgb_ABGR, 0, stbir__decode_float_linear_ABGR, stbir__decode_half_float_linear_ABGR }, + { /* RA */ stbir__decode_uint8_srgb2_linearalpha, stbir__decode_uint8_srgb, 0, stbir__decode_float_linear, stbir__decode_half_float_linear }, + { /* AR */ stbir__decode_uint8_srgb2_linearalpha_AR, stbir__decode_uint8_srgb_AR, 0, stbir__decode_float_linear_AR, stbir__decode_half_float_linear_AR }, + }; + + static stbir__decode_pixels_func * decode_simple_scaled_or_not[2][2]= + { + { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear }, + }; + + static stbir__decode_pixels_func * decode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]= + { + { /* RGBA */ { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear } }, + { /* BGRA */ { stbir__decode_uint8_linear_scaled_BGRA, stbir__decode_uint8_linear_BGRA }, { stbir__decode_uint16_linear_scaled_BGRA, stbir__decode_uint16_linear_BGRA } }, + { /* ARGB */ { stbir__decode_uint8_linear_scaled_ARGB, stbir__decode_uint8_linear_ARGB }, { stbir__decode_uint16_linear_scaled_ARGB, stbir__decode_uint16_linear_ARGB } }, + { /* ABGR */ { stbir__decode_uint8_linear_scaled_ABGR, stbir__decode_uint8_linear_ABGR }, { stbir__decode_uint16_linear_scaled_ABGR, stbir__decode_uint16_linear_ABGR } }, + { /* RA */ { stbir__decode_uint8_linear_scaled, stbir__decode_uint8_linear }, { stbir__decode_uint16_linear_scaled, stbir__decode_uint16_linear } }, + { /* AR */ { stbir__decode_uint8_linear_scaled_AR, stbir__decode_uint8_linear_AR }, { stbir__decode_uint16_linear_scaled_AR, stbir__decode_uint16_linear_AR } } + }; + + static stbir__encode_pixels_func * encode_simple[STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]= + { + /* 1ch-4ch */ stbir__encode_uint8_srgb, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear, + }; + + static stbir__encode_pixels_func * encode_alphas[STBIRI_AR-STBIRI_RGBA+1][STBIR_TYPE_HALF_FLOAT-STBIR_TYPE_UINT8_SRGB+1]= + { + { /* RGBA */ stbir__encode_uint8_srgb4_linearalpha, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear }, + { /* BGRA */ stbir__encode_uint8_srgb4_linearalpha_BGRA, stbir__encode_uint8_srgb_BGRA, 0, stbir__encode_float_linear_BGRA, stbir__encode_half_float_linear_BGRA }, + { /* ARGB */ stbir__encode_uint8_srgb4_linearalpha_ARGB, stbir__encode_uint8_srgb_ARGB, 0, stbir__encode_float_linear_ARGB, stbir__encode_half_float_linear_ARGB }, + { /* ABGR */ stbir__encode_uint8_srgb4_linearalpha_ABGR, stbir__encode_uint8_srgb_ABGR, 0, stbir__encode_float_linear_ABGR, stbir__encode_half_float_linear_ABGR }, + { /* RA */ stbir__encode_uint8_srgb2_linearalpha, stbir__encode_uint8_srgb, 0, stbir__encode_float_linear, stbir__encode_half_float_linear }, + { /* AR */ stbir__encode_uint8_srgb2_linearalpha_AR, stbir__encode_uint8_srgb_AR, 0, stbir__encode_float_linear_AR, stbir__encode_half_float_linear_AR } + }; + + static stbir__encode_pixels_func * encode_simple_scaled_or_not[2][2]= + { + { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear }, + }; + + static stbir__encode_pixels_func * encode_alphas_scaled_or_not[STBIRI_AR-STBIRI_RGBA+1][2][2]= + { + { /* RGBA */ { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear } }, + { /* BGRA */ { stbir__encode_uint8_linear_scaled_BGRA, stbir__encode_uint8_linear_BGRA }, { stbir__encode_uint16_linear_scaled_BGRA, stbir__encode_uint16_linear_BGRA } }, + { /* ARGB */ { stbir__encode_uint8_linear_scaled_ARGB, stbir__encode_uint8_linear_ARGB }, { stbir__encode_uint16_linear_scaled_ARGB, stbir__encode_uint16_linear_ARGB } }, + { /* ABGR */ { stbir__encode_uint8_linear_scaled_ABGR, stbir__encode_uint8_linear_ABGR }, { stbir__encode_uint16_linear_scaled_ABGR, stbir__encode_uint16_linear_ABGR } }, + { /* RA */ { stbir__encode_uint8_linear_scaled, stbir__encode_uint8_linear }, { stbir__encode_uint16_linear_scaled, stbir__encode_uint16_linear } }, + { /* AR */ { stbir__encode_uint8_linear_scaled_AR, stbir__encode_uint8_linear_AR }, { stbir__encode_uint16_linear_scaled_AR, stbir__encode_uint16_linear_AR } } + }; + + stbir__decode_pixels_func * decode_pixels = 0; + stbir__encode_pixels_func * encode_pixels = 0; + stbir_datatype input_type, output_type; + + input_type = resize->input_data_type; + output_type = resize->output_data_type; + info->input_data = resize->input_pixels; + info->input_stride_bytes = resize->input_stride_in_bytes; + info->output_stride_bytes = resize->output_stride_in_bytes; + + // if we're completely point sampling, then we can turn off SRGB + if ( ( info->horizontal.filter_enum == STBIR_FILTER_POINT_SAMPLE ) && ( info->vertical.filter_enum == STBIR_FILTER_POINT_SAMPLE ) ) + { + if ( ( ( input_type == STBIR_TYPE_UINT8_SRGB ) || ( input_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) && + ( ( output_type == STBIR_TYPE_UINT8_SRGB ) || ( output_type == STBIR_TYPE_UINT8_SRGB_ALPHA ) ) ) + { + input_type = STBIR_TYPE_UINT8; + output_type = STBIR_TYPE_UINT8; + } + } + + // recalc the output and input strides + if ( info->input_stride_bytes == 0 ) + info->input_stride_bytes = info->channels * info->horizontal.scale_info.input_full_size * stbir__type_size[input_type]; + + if ( info->output_stride_bytes == 0 ) + info->output_stride_bytes = info->channels * info->horizontal.scale_info.output_sub_size * stbir__type_size[output_type]; + + // calc offset + info->output_data = ( (char*) resize->output_pixels ) + ( (size_t) info->offset_y * (size_t) resize->output_stride_in_bytes ) + ( info->offset_x * info->channels * stbir__type_size[output_type] ); + + info->in_pixels_cb = resize->input_cb; + info->user_data = resize->user_data; + info->out_pixels_cb = resize->output_cb; + + // setup the input format converters + if ( ( input_type == STBIR_TYPE_UINT8 ) || ( input_type == STBIR_TYPE_UINT16 ) ) + { + int non_scaled = 0; + + // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16) + if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual) + if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) ) + non_scaled = 1; + + if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL ) + decode_pixels = decode_simple_scaled_or_not[ input_type == STBIR_TYPE_UINT16 ][ non_scaled ]; + else + decode_pixels = decode_alphas_scaled_or_not[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type == STBIR_TYPE_UINT16 ][ non_scaled ]; + } + else + { + if ( info->input_pixel_layout_internal <= STBIRI_4CHANNEL ) + decode_pixels = decode_simple[ input_type - STBIR_TYPE_UINT8_SRGB ]; + else + decode_pixels = decode_alphas[ ( info->input_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ input_type - STBIR_TYPE_UINT8_SRGB ]; + } + + // setup the output format converters + if ( ( output_type == STBIR_TYPE_UINT8 ) || ( output_type == STBIR_TYPE_UINT16 ) ) + { + int non_scaled = 0; + + // check if we can run unscaled - 0-255.0/0-65535.0 instead of 0-1.0 (which is a tiny bit faster when doing linear 8->8 or 16->16) + if ( ( !info->alpha_weight ) && ( !info->alpha_unweight ) ) // don't short circuit when alpha weighting (get everything to 0-1.0 as usual) + if ( ( ( input_type == STBIR_TYPE_UINT8 ) && ( output_type == STBIR_TYPE_UINT8 ) ) || ( ( input_type == STBIR_TYPE_UINT16 ) && ( output_type == STBIR_TYPE_UINT16 ) ) ) + non_scaled = 1; + + if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL ) + encode_pixels = encode_simple_scaled_or_not[ output_type == STBIR_TYPE_UINT16 ][ non_scaled ]; + else + encode_pixels = encode_alphas_scaled_or_not[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type == STBIR_TYPE_UINT16 ][ non_scaled ]; + } + else + { + if ( info->output_pixel_layout_internal <= STBIRI_4CHANNEL ) + encode_pixels = encode_simple[ output_type - STBIR_TYPE_UINT8_SRGB ]; + else + encode_pixels = encode_alphas[ ( info->output_pixel_layout_internal - STBIRI_RGBA ) % ( STBIRI_AR-STBIRI_RGBA+1 ) ][ output_type - STBIR_TYPE_UINT8_SRGB ]; + } + + info->input_type = input_type; + info->output_type = output_type; + info->decode_pixels = decode_pixels; + info->encode_pixels = encode_pixels; +} + +static void stbir__clip( int * outx, int * outsubw, int outw, double * u0, double * u1 ) +{ + double per, adj; + int over; + + // do left/top edge + if ( *outx < 0 ) + { + per = ( (double)*outx ) / ( (double)*outsubw ); // is negative + adj = per * ( *u1 - *u0 ); + *u0 -= adj; // increases u0 + *outx = 0; + } + + // do right/bot edge + over = outw - ( *outx + *outsubw ); + if ( over < 0 ) + { + per = ( (double)over ) / ( (double)*outsubw ); // is negative + adj = per * ( *u1 - *u0 ); + *u1 += adj; // decrease u1 + *outsubw = outw - *outx; + } +} + +// converts a double to a rational that has less than one float bit of error (returns 0 if unable to do so) +static int stbir__double_to_rational(double f, stbir_uint32 limit, stbir_uint32 *numer, stbir_uint32 *denom, int limit_denom ) // limit_denom (1) or limit numer (0) +{ + double err; + stbir_uint64 top, bot; + stbir_uint64 numer_last = 0; + stbir_uint64 denom_last = 1; + stbir_uint64 numer_estimate = 1; + stbir_uint64 denom_estimate = 0; + + // scale to past float error range + top = (stbir_uint64)( f * (double)(1 << 25) ); + bot = 1 << 25; + + // keep refining, but usually stops in a few loops - usually 5 for bad cases + for(;;) + { + stbir_uint64 est, temp; + + // hit limit, break out and do best full range estimate + if ( ( ( limit_denom ) ? denom_estimate : numer_estimate ) >= limit ) + break; + + // is the current error less than 1 bit of a float? if so, we're done + if ( denom_estimate ) + { + err = ( (double)numer_estimate / (double)denom_estimate ) - f; + if ( err < 0.0 ) err = -err; + if ( err < ( 1.0 / (double)(1<<24) ) ) + { + // yup, found it + *numer = (stbir_uint32) numer_estimate; + *denom = (stbir_uint32) denom_estimate; + return 1; + } + } + + // no more refinement bits left? break out and do full range estimate + if ( bot == 0 ) + break; + + // gcd the estimate bits + est = top / bot; + temp = top % bot; + top = bot; + bot = temp; + + // move remainders + temp = est * denom_estimate + denom_last; + denom_last = denom_estimate; + denom_estimate = temp; + + // move remainders + temp = est * numer_estimate + numer_last; + numer_last = numer_estimate; + numer_estimate = temp; + } + + // we didn't fine anything good enough for float, use a full range estimate + if ( limit_denom ) + { + numer_estimate= (stbir_uint64)( f * (double)limit + 0.5 ); + denom_estimate = limit; + } + else + { + numer_estimate = limit; + denom_estimate = (stbir_uint64)( ( (double)limit / f ) + 0.5 ); + } + + *numer = (stbir_uint32) numer_estimate; + *denom = (stbir_uint32) denom_estimate; + + err = ( denom_estimate ) ? ( ( (double)(stbir_uint32)numer_estimate / (double)(stbir_uint32)denom_estimate ) - f ) : 1.0; + if ( err < 0.0 ) err = -err; + return ( err < ( 1.0 / (double)(1<<24) ) ) ? 1 : 0; +} + +static int stbir__calculate_region_transform( stbir__scale_info * scale_info, int output_full_range, int * output_offset, int output_sub_range, int input_full_range, double input_s0, double input_s1 ) +{ + double output_range, input_range, output_s, input_s, ratio, scale; + + input_s = input_s1 - input_s0; + + // null area + if ( ( output_full_range == 0 ) || ( input_full_range == 0 ) || + ( output_sub_range == 0 ) || ( input_s <= stbir__small_float ) ) + return 0; + + // are either of the ranges completely out of bounds? + if ( ( *output_offset >= output_full_range ) || ( ( *output_offset + output_sub_range ) <= 0 ) || ( input_s0 >= (1.0f-stbir__small_float) ) || ( input_s1 <= stbir__small_float ) ) + return 0; + + output_range = (double)output_full_range; + input_range = (double)input_full_range; + + output_s = ( (double)output_sub_range) / output_range; + + // figure out the scaling to use + ratio = output_s / input_s; + + // save scale before clipping + scale = ( output_range / input_range ) * ratio; + scale_info->scale = (float)scale; + scale_info->inv_scale = (float)( 1.0 / scale ); + + // clip output area to left/right output edges (and adjust input area) + stbir__clip( output_offset, &output_sub_range, output_full_range, &input_s0, &input_s1 ); + + // recalc input area + input_s = input_s1 - input_s0; + + // after clipping do we have zero input area? + if ( input_s <= stbir__small_float ) + return 0; + + // calculate and store the starting source offsets in output pixel space + scale_info->pixel_shift = (float) ( input_s0 * ratio * output_range ); + + scale_info->scale_is_rational = stbir__double_to_rational( scale, ( scale <= 1.0 ) ? output_full_range : input_full_range, &scale_info->scale_numerator, &scale_info->scale_denominator, ( scale >= 1.0 ) ); + + scale_info->input_full_size = input_full_range; + scale_info->output_sub_size = output_sub_range; + + return 1; +} + + +static void stbir__init_and_set_layout( STBIR_RESIZE * resize, stbir_pixel_layout pixel_layout, stbir_datatype data_type ) +{ + resize->input_cb = 0; + resize->output_cb = 0; + resize->user_data = resize; + resize->samplers = 0; + resize->called_alloc = 0; + resize->horizontal_filter = STBIR_FILTER_DEFAULT; + resize->horizontal_filter_kernel = 0; resize->horizontal_filter_support = 0; + resize->vertical_filter = STBIR_FILTER_DEFAULT; + resize->vertical_filter_kernel = 0; resize->vertical_filter_support = 0; + resize->horizontal_edge = STBIR_EDGE_CLAMP; + resize->vertical_edge = STBIR_EDGE_CLAMP; + resize->input_s0 = 0; resize->input_t0 = 0; resize->input_s1 = 1; resize->input_t1 = 1; + resize->output_subx = 0; resize->output_suby = 0; resize->output_subw = resize->output_w; resize->output_subh = resize->output_h; + resize->input_data_type = data_type; + resize->output_data_type = data_type; + resize->input_pixel_layout_public = pixel_layout; + resize->output_pixel_layout_public = pixel_layout; + resize->needs_rebuild = 1; +} + +STBIRDEF void stbir_resize_init( STBIR_RESIZE * resize, + const void *input_pixels, int input_w, int input_h, int input_stride_in_bytes, // stride can be zero + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, // stride can be zero + stbir_pixel_layout pixel_layout, stbir_datatype data_type ) +{ + resize->input_pixels = input_pixels; + resize->input_w = input_w; + resize->input_h = input_h; + resize->input_stride_in_bytes = input_stride_in_bytes; + resize->output_pixels = output_pixels; + resize->output_w = output_w; + resize->output_h = output_h; + resize->output_stride_in_bytes = output_stride_in_bytes; + resize->fast_alpha = 0; + + stbir__init_and_set_layout( resize, pixel_layout, data_type ); +} + +// You can update parameters any time after resize_init +STBIRDEF void stbir_set_datatypes( STBIR_RESIZE * resize, stbir_datatype input_type, stbir_datatype output_type ) // by default, datatype from resize_init +{ + resize->input_data_type = input_type; + resize->output_data_type = output_type; + if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) + stbir__update_info_from_resize( resize->samplers, resize ); +} + +STBIRDEF void stbir_set_pixel_callbacks( STBIR_RESIZE * resize, stbir_input_callback * input_cb, stbir_output_callback * output_cb ) // no callbacks by default +{ + resize->input_cb = input_cb; + resize->output_cb = output_cb; + + if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) + { + resize->samplers->in_pixels_cb = input_cb; + resize->samplers->out_pixels_cb = output_cb; + } +} + +STBIRDEF void stbir_set_user_data( STBIR_RESIZE * resize, void * user_data ) // pass back STBIR_RESIZE* by default +{ + resize->user_data = user_data; + if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) + resize->samplers->user_data = user_data; +} + +STBIRDEF void stbir_set_buffer_ptrs( STBIR_RESIZE * resize, const void * input_pixels, int input_stride_in_bytes, void * output_pixels, int output_stride_in_bytes ) +{ + resize->input_pixels = input_pixels; + resize->input_stride_in_bytes = input_stride_in_bytes; + resize->output_pixels = output_pixels; + resize->output_stride_in_bytes = output_stride_in_bytes; + if ( ( resize->samplers ) && ( !resize->needs_rebuild ) ) + stbir__update_info_from_resize( resize->samplers, resize ); +} + + +STBIRDEF int stbir_set_edgemodes( STBIR_RESIZE * resize, stbir_edge horizontal_edge, stbir_edge vertical_edge ) // CLAMP by default +{ + resize->horizontal_edge = horizontal_edge; + resize->vertical_edge = vertical_edge; + resize->needs_rebuild = 1; + return 1; +} + +STBIRDEF int stbir_set_filters( STBIR_RESIZE * resize, stbir_filter horizontal_filter, stbir_filter vertical_filter ) // STBIR_DEFAULT_FILTER_UPSAMPLE/DOWNSAMPLE by default +{ + resize->horizontal_filter = horizontal_filter; + resize->vertical_filter = vertical_filter; + resize->needs_rebuild = 1; + return 1; +} + +STBIRDEF int stbir_set_filter_callbacks( STBIR_RESIZE * resize, stbir__kernel_callback * horizontal_filter, stbir__support_callback * horizontal_support, stbir__kernel_callback * vertical_filter, stbir__support_callback * vertical_support ) +{ + resize->horizontal_filter_kernel = horizontal_filter; resize->horizontal_filter_support = horizontal_support; + resize->vertical_filter_kernel = vertical_filter; resize->vertical_filter_support = vertical_support; + resize->needs_rebuild = 1; + return 1; +} + +STBIRDEF int stbir_set_pixel_layouts( STBIR_RESIZE * resize, stbir_pixel_layout input_pixel_layout, stbir_pixel_layout output_pixel_layout ) // sets new pixel layouts +{ + resize->input_pixel_layout_public = input_pixel_layout; + resize->output_pixel_layout_public = output_pixel_layout; + resize->needs_rebuild = 1; + return 1; +} + + +STBIRDEF int stbir_set_non_pm_alpha_speed_over_quality( STBIR_RESIZE * resize, int non_pma_alpha_speed_over_quality ) // sets alpha speed +{ + resize->fast_alpha = non_pma_alpha_speed_over_quality; + resize->needs_rebuild = 1; + return 1; +} + +STBIRDEF int stbir_set_input_subrect( STBIR_RESIZE * resize, double s0, double t0, double s1, double t1 ) // sets input region (full region by default) +{ + resize->input_s0 = s0; + resize->input_t0 = t0; + resize->input_s1 = s1; + resize->input_t1 = t1; + resize->needs_rebuild = 1; + + // are we inbounds? + if ( ( s1 < stbir__small_float ) || ( (s1-s0) < stbir__small_float ) || + ( t1 < stbir__small_float ) || ( (t1-t0) < stbir__small_float ) || + ( s0 > (1.0f-stbir__small_float) ) || + ( t0 > (1.0f-stbir__small_float) ) ) + return 0; + + return 1; +} + +STBIRDEF int stbir_set_output_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ) // sets input region (full region by default) +{ + resize->output_subx = subx; + resize->output_suby = suby; + resize->output_subw = subw; + resize->output_subh = subh; + resize->needs_rebuild = 1; + + // are we inbounds? + if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) ) + return 0; + + return 1; +} + +STBIRDEF int stbir_set_pixel_subrect( STBIR_RESIZE * resize, int subx, int suby, int subw, int subh ) // sets both regions (full regions by default) +{ + double s0, t0, s1, t1; + + s0 = ( (double)subx ) / ( (double)resize->output_w ); + t0 = ( (double)suby ) / ( (double)resize->output_h ); + s1 = ( (double)(subx+subw) ) / ( (double)resize->output_w ); + t1 = ( (double)(suby+subh) ) / ( (double)resize->output_h ); + + resize->input_s0 = s0; + resize->input_t0 = t0; + resize->input_s1 = s1; + resize->input_t1 = t1; + resize->output_subx = subx; + resize->output_suby = suby; + resize->output_subw = subw; + resize->output_subh = subh; + resize->needs_rebuild = 1; + + // are we inbounds? + if ( ( subx >= resize->output_w ) || ( ( subx + subw ) <= 0 ) || ( suby >= resize->output_h ) || ( ( suby + subh ) <= 0 ) || ( subw == 0 ) || ( subh == 0 ) ) + return 0; + + return 1; +} + +static int stbir__perform_build( STBIR_RESIZE * resize, int splits ) +{ + stbir__contributors conservative = { 0, 0 }; + stbir__sampler horizontal, vertical; + int new_output_subx, new_output_suby; + stbir__info * out_info; + #ifdef STBIR_PROFILE + stbir__info profile_infod; // used to contain building profile info before everything is allocated + stbir__info * profile_info = &profile_infod; + #endif + + // have we already built the samplers? + if ( resize->samplers ) + return 0; + + #define STBIR_RETURN_ERROR_AND_ASSERT( exp ) STBIR_ASSERT( !(exp) ); if (exp) return 0; + STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->horizontal_filter >= STBIR_FILTER_OTHER) + STBIR_RETURN_ERROR_AND_ASSERT( (unsigned)resize->vertical_filter >= STBIR_FILTER_OTHER) + #undef STBIR_RETURN_ERROR_AND_ASSERT + + if ( splits <= 0 ) + return 0; + + STBIR_PROFILE_BUILD_FIRST_START( build ); + + new_output_subx = resize->output_subx; + new_output_suby = resize->output_suby; + + // do horizontal clip and scale calcs + if ( !stbir__calculate_region_transform( &horizontal.scale_info, resize->output_w, &new_output_subx, resize->output_subw, resize->input_w, resize->input_s0, resize->input_s1 ) ) + return 0; + + // do vertical clip and scale calcs + if ( !stbir__calculate_region_transform( &vertical.scale_info, resize->output_h, &new_output_suby, resize->output_subh, resize->input_h, resize->input_t0, resize->input_t1 ) ) + return 0; + + // if nothing to do, just return + if ( ( horizontal.scale_info.output_sub_size == 0 ) || ( vertical.scale_info.output_sub_size == 0 ) ) + return 0; + + stbir__set_sampler(&horizontal, resize->horizontal_filter, resize->horizontal_filter_kernel, resize->horizontal_filter_support, resize->horizontal_edge, &horizontal.scale_info, 1, resize->user_data ); + stbir__get_conservative_extents( &horizontal, &conservative, resize->user_data ); + stbir__set_sampler(&vertical, resize->vertical_filter, resize->vertical_filter_kernel, resize->vertical_filter_support, resize->vertical_edge, &vertical.scale_info, 0, resize->user_data ); + + if ( ( vertical.scale_info.output_sub_size / splits ) < STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS ) // each split should be a minimum of 4 scanlines (handwavey choice) + { + splits = vertical.scale_info.output_sub_size / STBIR_FORCE_MINIMUM_SCANLINES_FOR_SPLITS; + if ( splits == 0 ) splits = 1; + } + + STBIR_PROFILE_BUILD_START( alloc ); + out_info = stbir__alloc_internal_mem_and_build_samplers( &horizontal, &vertical, &conservative, resize->input_pixel_layout_public, resize->output_pixel_layout_public, splits, new_output_subx, new_output_suby, resize->fast_alpha, resize->user_data STBIR_ONLY_PROFILE_BUILD_SET_INFO ); + STBIR_PROFILE_BUILD_END( alloc ); + STBIR_PROFILE_BUILD_END( build ); + + if ( out_info ) + { + resize->splits = splits; + resize->samplers = out_info; + resize->needs_rebuild = 0; + #ifdef STBIR_PROFILE + STBIR_MEMCPY( &out_info->profile, &profile_infod.profile, sizeof( out_info->profile ) ); + #endif + + // update anything that can be changed without recalcing samplers + stbir__update_info_from_resize( out_info, resize ); + + return splits; + } + + return 0; +} + +void stbir_free_samplers( STBIR_RESIZE * resize ) +{ + if ( resize->samplers ) + { + stbir__free_internal_mem( resize->samplers ); + resize->samplers = 0; + resize->called_alloc = 0; + } +} + +STBIRDEF int stbir_build_samplers_with_splits( STBIR_RESIZE * resize, int splits ) +{ + if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) ) + { + if ( resize->samplers ) + stbir_free_samplers( resize ); + + resize->called_alloc = 1; + return stbir__perform_build( resize, splits ); + } + + STBIR_PROFILE_BUILD_CLEAR( resize->samplers ); + + return 1; +} + +STBIRDEF int stbir_build_samplers( STBIR_RESIZE * resize ) +{ + return stbir_build_samplers_with_splits( resize, 1 ); +} + +STBIRDEF int stbir_resize_extended( STBIR_RESIZE * resize ) +{ + int result; + + if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) ) + { + int alloc_state = resize->called_alloc; // remember allocated state + + if ( resize->samplers ) + { + stbir__free_internal_mem( resize->samplers ); + resize->samplers = 0; + } + + if ( !stbir_build_samplers( resize ) ) + return 0; + + resize->called_alloc = alloc_state; + + // if build_samplers succeeded (above), but there are no samplers set, then + // the area to stretch into was zero pixels, so don't do anything and return + // success + if ( resize->samplers == 0 ) + return 1; + } + else + { + // didn't build anything - clear it + STBIR_PROFILE_BUILD_CLEAR( resize->samplers ); + } + + // do resize + result = stbir__perform_resize( resize->samplers, 0, resize->splits ); + + // if we alloced, then free + if ( !resize->called_alloc ) + { + stbir_free_samplers( resize ); + resize->samplers = 0; + } + + return result; +} + +STBIRDEF int stbir_resize_extended_split( STBIR_RESIZE * resize, int split_start, int split_count ) +{ + STBIR_ASSERT( resize->samplers ); + + // if we're just doing the whole thing, call full + if ( ( split_start == -1 ) || ( ( split_start == 0 ) && ( split_count == resize->splits ) ) ) + return stbir_resize_extended( resize ); + + // you **must** build samplers first when using split resize + if ( ( resize->samplers == 0 ) || ( resize->needs_rebuild ) ) + return 0; + + if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) ) + return 0; + + // do resize + return stbir__perform_resize( resize->samplers, split_start, split_count ); +} + + +static void * stbir_quick_resize_helper( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout, stbir_datatype data_type, stbir_edge edge, stbir_filter filter ) +{ + STBIR_RESIZE resize; + int scanline_output_in_bytes; + int positive_output_stride_in_bytes; + void * start_ptr; + void * free_ptr; + + scanline_output_in_bytes = output_w * stbir__type_size[ data_type ] * stbir__pixel_channels[ stbir__pixel_layout_convert_public_to_internal[ pixel_layout ] ]; + if ( scanline_output_in_bytes == 0 ) + return 0; + + // if zero stride, use scanline output + if ( output_stride_in_bytes == 0 ) + output_stride_in_bytes = scanline_output_in_bytes; + + // abs value for inverted images (negative pitches) + positive_output_stride_in_bytes = output_stride_in_bytes; + if ( positive_output_stride_in_bytes < 0 ) + positive_output_stride_in_bytes = -positive_output_stride_in_bytes; + + // is the requested stride smaller than the scanline output? if so, just fail + if ( positive_output_stride_in_bytes < scanline_output_in_bytes ) + return 0; + + start_ptr = output_pixels; + free_ptr = 0; // no free pointer, since they passed buffer to use + + // did they pass a zero for the dest? if so, allocate the buffer + if ( output_pixels == 0 ) + { + size_t size; + char * ptr; + + size = (size_t)positive_output_stride_in_bytes * (size_t)output_h; + if ( size == 0 ) + return 0; + + ptr = (char*) STBIR_MALLOC( size, 0 ); + if ( ptr == 0 ) + return 0; + + free_ptr = ptr; + + // point at the last scanline, if they requested a flipped image + if ( output_stride_in_bytes < 0 ) + start_ptr = ptr + ( (size_t)positive_output_stride_in_bytes * (size_t)( output_h - 1 ) ); + else + start_ptr = ptr; + } + + // ok, now do the resize + stbir_resize_init( &resize, + input_pixels, input_w, input_h, input_stride_in_bytes, + start_ptr, output_w, output_h, output_stride_in_bytes, + pixel_layout, data_type ); + + resize.horizontal_edge = edge; + resize.vertical_edge = edge; + resize.horizontal_filter = filter; + resize.vertical_filter = filter; + + if ( !stbir_resize_extended( &resize ) ) + { + if ( free_ptr ) + STBIR_FREE( free_ptr, 0 ); + return 0; + } + + return (free_ptr) ? free_ptr : start_ptr; +} + + + +STBIRDEF unsigned char * stbir_resize_uint8_linear( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_UINT8, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + +STBIRDEF unsigned char * stbir_resize_uint8_srgb( const unsigned char *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (unsigned char *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_UINT8_SRGB, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + + +STBIRDEF float * stbir_resize_float_linear( const float *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + float *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout ) +{ + return (float *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, STBIR_TYPE_FLOAT, STBIR_EDGE_CLAMP, STBIR_FILTER_DEFAULT ); +} + + +STBIRDEF void * stbir_resize( const void *input_pixels , int input_w , int input_h, int input_stride_in_bytes, + void *output_pixels, int output_w, int output_h, int output_stride_in_bytes, + stbir_pixel_layout pixel_layout, stbir_datatype data_type, + stbir_edge edge, stbir_filter filter ) +{ + return (void *) stbir_quick_resize_helper( input_pixels , input_w , input_h, input_stride_in_bytes, + output_pixels, output_w, output_h, output_stride_in_bytes, + pixel_layout, data_type, edge, filter ); +} + +#ifdef STBIR_PROFILE + +STBIRDEF void stbir_resize_build_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize ) +{ + static char const * bdescriptions[6] = { "Building", "Allocating", "Horizontal sampler", "Vertical sampler", "Coefficient cleanup", "Coefficient piovot" } ; + stbir__info* samp = resize->samplers; + int i; + + typedef int testa[ (STBIR__ARRAY_SIZE( bdescriptions ) == (STBIR__ARRAY_SIZE( samp->profile.array )-1) )?1:-1]; + typedef int testb[ (sizeof( samp->profile.array ) == (sizeof(samp->profile.named)) )?1:-1]; + typedef int testc[ (sizeof( info->clocks ) >= (sizeof(samp->profile.named)) )?1:-1]; + + for( i = 0 ; i < STBIR__ARRAY_SIZE( bdescriptions ) ; i++) + info->clocks[i] = samp->profile.array[i+1]; + + info->total_clocks = samp->profile.named.total; + info->descriptions = bdescriptions; + info->count = STBIR__ARRAY_SIZE( bdescriptions ); +} + +STBIRDEF void stbir_resize_split_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize, int split_start, int split_count ) +{ + static char const * descriptions[7] = { "Looping", "Vertical sampling", "Horizontal sampling", "Scanline input", "Scanline output", "Alpha weighting", "Alpha unweighting" }; + stbir__per_split_info * split_info; + int s, i; + + typedef int testa[ (STBIR__ARRAY_SIZE( descriptions ) == (STBIR__ARRAY_SIZE( split_info->profile.array )-1) )?1:-1]; + typedef int testb[ (sizeof( split_info->profile.array ) == (sizeof(split_info->profile.named)) )?1:-1]; + typedef int testc[ (sizeof( info->clocks ) >= (sizeof(split_info->profile.named)) )?1:-1]; + + if ( split_start == -1 ) + { + split_start = 0; + split_count = resize->samplers->splits; + } + + if ( ( split_start >= resize->splits ) || ( split_start < 0 ) || ( ( split_start + split_count ) > resize->splits ) || ( split_count <= 0 ) ) + { + info->total_clocks = 0; + info->descriptions = 0; + info->count = 0; + return; + } + + split_info = resize->samplers->split_info + split_start; + + // sum up the profile from all the splits + for( i = 0 ; i < STBIR__ARRAY_SIZE( descriptions ) ; i++ ) + { + stbir_uint64 sum = 0; + for( s = 0 ; s < split_count ; s++ ) + sum += split_info[s].profile.array[i+1]; + info->clocks[i] = sum; + } + + info->total_clocks = split_info->profile.named.total; + info->descriptions = descriptions; + info->count = STBIR__ARRAY_SIZE( descriptions ); +} + +STBIRDEF void stbir_resize_extended_profile_info( STBIR_PROFILE_INFO * info, STBIR_RESIZE const * resize ) +{ + stbir_resize_split_profile_info( info, resize, -1, 0 ); +} + +#endif // STBIR_PROFILE + +#undef STBIR_BGR +#undef STBIR_1CHANNEL +#undef STBIR_2CHANNEL +#undef STBIR_RGB +#undef STBIR_RGBA +#undef STBIR_4CHANNEL +#undef STBIR_BGRA +#undef STBIR_ARGB +#undef STBIR_ABGR +#undef STBIR_RA +#undef STBIR_AR +#undef STBIR_RGBA_PM +#undef STBIR_BGRA_PM +#undef STBIR_ARGB_PM +#undef STBIR_ABGR_PM +#undef STBIR_RA_PM +#undef STBIR_AR_PM + +#endif // STB_IMAGE_RESIZE_IMPLEMENTATION + +#else // STB_IMAGE_RESIZE_HORIZONTALS&STB_IMAGE_RESIZE_DO_VERTICALS + +// we reinclude the header file to define all the horizontal functions +// specializing each function for the number of coeffs is 20-40% faster *OVERALL* + +// by including the header file again this way, we can still debug the functions + +#define STBIR_strs_join2( start, mid, end ) start##mid##end +#define STBIR_strs_join1( start, mid, end ) STBIR_strs_join2( start, mid, end ) + +#define STBIR_strs_join24( start, mid1, mid2, end ) start##mid1##mid2##end +#define STBIR_strs_join14( start, mid1, mid2, end ) STBIR_strs_join24( start, mid1, mid2, end ) + +#ifdef STB_IMAGE_RESIZE_DO_CODERS + +#ifdef stbir__decode_suffix +#define STBIR__CODER_NAME( name ) STBIR_strs_join1( name, _, stbir__decode_suffix ) +#else +#define STBIR__CODER_NAME( name ) name +#endif + +#ifdef stbir__decode_swizzle +#define stbir__decode_simdf8_flip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3),stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg) +#define stbir__decode_simdf4_flip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__decode_order0,stbir__decode_order1),stbir__decode_order2,stbir__decode_order3)(reg, reg) +#define stbir__encode_simdf8_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( STBIR_strs_join1( stbir__simdf8_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3),stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg) +#define stbir__encode_simdf4_unflip(reg) STBIR_strs_join1( STBIR_strs_join1( stbir__simdf_0123to,stbir__encode_order0,stbir__encode_order1),stbir__encode_order2,stbir__encode_order3)(reg, reg) +#else +#define stbir__decode_order0 0 +#define stbir__decode_order1 1 +#define stbir__decode_order2 2 +#define stbir__decode_order3 3 +#define stbir__encode_order0 0 +#define stbir__encode_order1 1 +#define stbir__encode_order2 2 +#define stbir__encode_order3 3 +#define stbir__decode_simdf8_flip(reg) +#define stbir__decode_simdf4_flip(reg) +#define stbir__encode_simdf8_unflip(reg) +#define stbir__encode_simdf4_unflip(reg) +#endif + +#ifdef STBIR_SIMD8 +#define stbir__encode_simdfX_unflip stbir__encode_simdf8_unflip +#else +#define stbir__encode_simdfX_unflip stbir__encode_simdf4_unflip +#endif + +static float * STBIR__CODER_NAME( stbir__decode_uint8_linear_scaled )( float * decodep, int width_times_channels, void const * inputp ) +{ + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + unsigned char const * input = (unsigned char const*)inputp; + + #ifdef STBIR_SIMD + unsigned char const * end_input_m16 = input + width_times_channels - 16; + if ( width_times_channels >= 16 ) + { + decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + #ifdef STBIR_SIMD8 + stbir__simdi i; stbir__simdi8 o0,o1; + stbir__simdf8 of0, of1; + STBIR_NO_UNROLL(decode); + stbir__simdi_load( i, input ); + stbir__simdi8_expand_u8_to_u32( o0, o1, i ); + stbir__simdi8_convert_i32_to_float( of0, o0 ); + stbir__simdi8_convert_i32_to_float( of1, o1 ); + stbir__simdf8_mult( of0, of0, STBIR_max_uint8_as_float_inverted8); + stbir__simdf8_mult( of1, of1, STBIR_max_uint8_as_float_inverted8); + stbir__decode_simdf8_flip( of0 ); + stbir__decode_simdf8_flip( of1 ); + stbir__simdf8_store( decode + 0, of0 ); + stbir__simdf8_store( decode + 8, of1 ); + #else + stbir__simdi i, o0, o1, o2, o3; + stbir__simdf of0, of1, of2, of3; + STBIR_NO_UNROLL(decode); + stbir__simdi_load( i, input ); + stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i); + stbir__simdi_convert_i32_to_float( of0, o0 ); + stbir__simdi_convert_i32_to_float( of1, o1 ); + stbir__simdi_convert_i32_to_float( of2, o2 ); + stbir__simdi_convert_i32_to_float( of3, o3 ); + stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) ); + stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) ); + stbir__simdf_mult( of2, of2, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) ); + stbir__simdf_mult( of3, of3, STBIR__CONSTF(STBIR_max_uint8_as_float_inverted) ); + stbir__decode_simdf4_flip( of0 ); + stbir__decode_simdf4_flip( of1 ); + stbir__decode_simdf4_flip( of2 ); + stbir__decode_simdf4_flip( of3 ); + stbir__simdf_store( decode + 0, of0 ); + stbir__simdf_store( decode + 4, of1 ); + stbir__simdf_store( decode + 8, of2 ); + stbir__simdf_store( decode + 12, of3 ); + #endif + decode += 16; + input += 16; + if ( decode <= decode_end ) + continue; + if ( decode == ( decode_end + 16 ) ) + break; + decode = decode_end; // backup and do last couple + input = end_input_m16; + } + return decode_end + 16; + } + #endif + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( decode <= decode_end ) + { + STBIR_SIMD_NO_UNROLL(decode); + decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted; + decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted; + decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted; + decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint8_as_float_inverted; + decode += 4; + input += 4; + } + decode -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( decode < decode_end ) + { + STBIR_NO_UNROLL(decode); + decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint8_as_float_inverted; + #if stbir__coder_min_num >= 2 + decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint8_as_float_inverted; + #endif + #if stbir__coder_min_num >= 3 + decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint8_as_float_inverted; + #endif + decode += stbir__coder_min_num; + input += stbir__coder_min_num; + } + #endif + + return decode_end; +} + +static void STBIR__CODER_NAME( stbir__encode_uint8_linear_scaled )( void * outputp, int width_times_channels, float const * encode ) +{ + unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp; + unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels; + + #ifdef STBIR_SIMD + if ( width_times_channels >= stbir__simdfX_float_count*2 ) + { + float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; + end_output -= stbir__simdfX_float_count*2; + STBIR_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + stbir__simdfX e0, e1; + stbir__simdi i; + STBIR_SIMD_NO_UNROLL(encode); + stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode ); + stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint8_as_floatX, encode+stbir__simdfX_float_count ); + stbir__encode_simdfX_unflip( e0 ); + stbir__encode_simdfX_unflip( e1 ); + #ifdef STBIR_SIMD8 + stbir__simdf8_pack_to_16bytes( i, e0, e1 ); + stbir__simdi_store( output, i ); + #else + stbir__simdf_pack_to_8bytes( i, e0, e1 ); + stbir__simdi_store2( output, i ); + #endif + encode += stbir__simdfX_float_count*2; + output += stbir__simdfX_float_count*2; + if ( output <= end_output ) + continue; + if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m8; + } + return; + } + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + stbir__simdf e0; + stbir__simdi i0; + STBIR_NO_UNROLL(encode); + stbir__simdf_load( e0, encode ); + stbir__simdf_madd( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), e0 ); + stbir__encode_simdf4_unflip( e0 ); + stbir__simdf_pack_to_8bytes( i0, e0, e0 ); // only use first 4 + *(int*)(output-4) = stbir__simdi_to_int( i0 ); + output += 4; + encode += 4; + } + output -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + stbir__simdf e0; + STBIR_NO_UNROLL(encode); + stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_uint8( e0 ); + #if stbir__coder_min_num >= 2 + stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_uint8( e0 ); + #endif + #if stbir__coder_min_num >= 3 + stbir__simdf_madd1_mem( e0, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint8_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_uint8( e0 ); + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif + + #else + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + while( output <= end_output ) + { + float f; + f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f; + f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f; + f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f; + f = encode[stbir__encode_order3] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f; + output += 4; + encode += 4; + } + output -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + float f; + STBIR_NO_UNROLL(encode); + f = encode[stbir__encode_order0] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f; + #if stbir__coder_min_num >= 2 + f = encode[stbir__encode_order1] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f; + #endif + #if stbir__coder_min_num >= 3 + f = encode[stbir__encode_order2] * stbir__max_uint8_as_float + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f; + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif + #endif +} + +static float * STBIR__CODER_NAME(stbir__decode_uint8_linear)( float * decodep, int width_times_channels, void const * inputp ) +{ + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + unsigned char const * input = (unsigned char const*)inputp; + + #ifdef STBIR_SIMD + unsigned char const * end_input_m16 = input + width_times_channels - 16; + if ( width_times_channels >= 16 ) + { + decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + #ifdef STBIR_SIMD8 + stbir__simdi i; stbir__simdi8 o0,o1; + stbir__simdf8 of0, of1; + STBIR_NO_UNROLL(decode); + stbir__simdi_load( i, input ); + stbir__simdi8_expand_u8_to_u32( o0, o1, i ); + stbir__simdi8_convert_i32_to_float( of0, o0 ); + stbir__simdi8_convert_i32_to_float( of1, o1 ); + stbir__decode_simdf8_flip( of0 ); + stbir__decode_simdf8_flip( of1 ); + stbir__simdf8_store( decode + 0, of0 ); + stbir__simdf8_store( decode + 8, of1 ); + #else + stbir__simdi i, o0, o1, o2, o3; + stbir__simdf of0, of1, of2, of3; + STBIR_NO_UNROLL(decode); + stbir__simdi_load( i, input ); + stbir__simdi_expand_u8_to_u32( o0,o1,o2,o3,i); + stbir__simdi_convert_i32_to_float( of0, o0 ); + stbir__simdi_convert_i32_to_float( of1, o1 ); + stbir__simdi_convert_i32_to_float( of2, o2 ); + stbir__simdi_convert_i32_to_float( of3, o3 ); + stbir__decode_simdf4_flip( of0 ); + stbir__decode_simdf4_flip( of1 ); + stbir__decode_simdf4_flip( of2 ); + stbir__decode_simdf4_flip( of3 ); + stbir__simdf_store( decode + 0, of0 ); + stbir__simdf_store( decode + 4, of1 ); + stbir__simdf_store( decode + 8, of2 ); + stbir__simdf_store( decode + 12, of3 ); +#endif + decode += 16; + input += 16; + if ( decode <= decode_end ) + continue; + if ( decode == ( decode_end + 16 ) ) + break; + decode = decode_end; // backup and do last couple + input = end_input_m16; + } + return decode_end + 16; + } + #endif + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( decode <= decode_end ) + { + STBIR_SIMD_NO_UNROLL(decode); + decode[0-4] = ((float)(input[stbir__decode_order0])); + decode[1-4] = ((float)(input[stbir__decode_order1])); + decode[2-4] = ((float)(input[stbir__decode_order2])); + decode[3-4] = ((float)(input[stbir__decode_order3])); + decode += 4; + input += 4; + } + decode -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( decode < decode_end ) + { + STBIR_NO_UNROLL(decode); + decode[0] = ((float)(input[stbir__decode_order0])); + #if stbir__coder_min_num >= 2 + decode[1] = ((float)(input[stbir__decode_order1])); + #endif + #if stbir__coder_min_num >= 3 + decode[2] = ((float)(input[stbir__decode_order2])); + #endif + decode += stbir__coder_min_num; + input += stbir__coder_min_num; + } + #endif + return decode_end; +} + +static void STBIR__CODER_NAME( stbir__encode_uint8_linear )( void * outputp, int width_times_channels, float const * encode ) +{ + unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char *) outputp; + unsigned char * end_output = ( (unsigned char *) output ) + width_times_channels; + + #ifdef STBIR_SIMD + if ( width_times_channels >= stbir__simdfX_float_count*2 ) + { + float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; + end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + stbir__simdfX e0, e1; + stbir__simdi i; + STBIR_SIMD_NO_UNROLL(encode); + stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode ); + stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count ); + stbir__encode_simdfX_unflip( e0 ); + stbir__encode_simdfX_unflip( e1 ); + #ifdef STBIR_SIMD8 + stbir__simdf8_pack_to_16bytes( i, e0, e1 ); + stbir__simdi_store( output, i ); + #else + stbir__simdf_pack_to_8bytes( i, e0, e1 ); + stbir__simdi_store2( output, i ); + #endif + encode += stbir__simdfX_float_count*2; + output += stbir__simdfX_float_count*2; + if ( output <= end_output ) + continue; + if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m8; + } + return; + } + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + stbir__simdf e0; + stbir__simdi i0; + STBIR_NO_UNROLL(encode); + stbir__simdf_load( e0, encode ); + stbir__simdf_add( e0, STBIR__CONSTF(STBIR_simd_point5), e0 ); + stbir__encode_simdf4_unflip( e0 ); + stbir__simdf_pack_to_8bytes( i0, e0, e0 ); // only use first 4 + *(int*)(output-4) = stbir__simdi_to_int( i0 ); + output += 4; + encode += 4; + } + output -= 4; + #endif + + #else + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + while( output <= end_output ) + { + float f; + f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0-4] = (unsigned char)f; + f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1-4] = (unsigned char)f; + f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2-4] = (unsigned char)f; + f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 255); output[3-4] = (unsigned char)f; + output += 4; + encode += 4; + } + output -= 4; + #endif + + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + float f; + STBIR_NO_UNROLL(encode); + f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 255); output[0] = (unsigned char)f; + #if stbir__coder_min_num >= 2 + f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 255); output[1] = (unsigned char)f; + #endif + #if stbir__coder_min_num >= 3 + f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 255); output[2] = (unsigned char)f; + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif +} + +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb)( float * decodep, int width_times_channels, void const * inputp ) +{ + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + unsigned char const * input = (unsigned char const *)inputp; + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + decode += 4; + while( decode <= decode_end ) + { + decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ]; + decode[1-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ]; + decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ]; + decode[3-4] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order3 ] ]; + decode += 4; + input += 4; + } + decode -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( decode < decode_end ) + { + STBIR_NO_UNROLL(decode); + decode[0] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order0 ] ]; + #if stbir__coder_min_num >= 2 + decode[1] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order1 ] ]; + #endif + #if stbir__coder_min_num >= 3 + decode[2] = stbir__srgb_uchar_to_linear_float[ input[ stbir__decode_order2 ] ]; + #endif + decode += stbir__coder_min_num; + input += stbir__coder_min_num; + } + #endif + return decode_end; +} + +#define stbir__min_max_shift20( i, f ) \ + stbir__simdf_max( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_zero )) ); \ + stbir__simdf_min( f, f, stbir_simdf_casti(STBIR__CONSTI( STBIR_almost_one )) ); \ + stbir__simdi_32shr( i, stbir_simdi_castf( f ), 20 ); + +#define stbir__scale_and_convert( i, f ) \ + stbir__simdf_madd( f, STBIR__CONSTF( STBIR_simd_point5 ), STBIR__CONSTF( STBIR_max_uint8_as_float ), f ); \ + stbir__simdf_max( f, f, stbir__simdf_zeroP() ); \ + stbir__simdf_min( f, f, STBIR__CONSTF( STBIR_max_uint8_as_float ) ); \ + stbir__simdf_convert_float_to_i32( i, f ); + +#define stbir__linear_to_srgb_finish( i, f ) \ +{ \ + stbir__simdi temp; \ + stbir__simdi_32shr( temp, stbir_simdi_castf( f ), 12 ) ; \ + stbir__simdi_and( temp, temp, STBIR__CONSTI(STBIR_mastissa_mask) ); \ + stbir__simdi_or( temp, temp, STBIR__CONSTI(STBIR_topscale) ); \ + stbir__simdi_16madd( i, i, temp ); \ + stbir__simdi_32shr( i, i, 16 ); \ +} + +#define stbir__simdi_table_lookup2( v0,v1, table ) \ +{ \ + stbir__simdi_u32 temp0,temp1; \ + temp0.m128i_i128 = v0; \ + temp1.m128i_i128 = v1; \ + temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \ + temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \ + v0 = temp0.m128i_i128; \ + v1 = temp1.m128i_i128; \ +} + +#define stbir__simdi_table_lookup3( v0,v1,v2, table ) \ +{ \ + stbir__simdi_u32 temp0,temp1,temp2; \ + temp0.m128i_i128 = v0; \ + temp1.m128i_i128 = v1; \ + temp2.m128i_i128 = v2; \ + temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \ + temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \ + temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \ + v0 = temp0.m128i_i128; \ + v1 = temp1.m128i_i128; \ + v2 = temp2.m128i_i128; \ +} + +#define stbir__simdi_table_lookup4( v0,v1,v2,v3, table ) \ +{ \ + stbir__simdi_u32 temp0,temp1,temp2,temp3; \ + temp0.m128i_i128 = v0; \ + temp1.m128i_i128 = v1; \ + temp2.m128i_i128 = v2; \ + temp3.m128i_i128 = v3; \ + temp0.m128i_u32[0] = table[temp0.m128i_i32[0]]; temp0.m128i_u32[1] = table[temp0.m128i_i32[1]]; temp0.m128i_u32[2] = table[temp0.m128i_i32[2]]; temp0.m128i_u32[3] = table[temp0.m128i_i32[3]]; \ + temp1.m128i_u32[0] = table[temp1.m128i_i32[0]]; temp1.m128i_u32[1] = table[temp1.m128i_i32[1]]; temp1.m128i_u32[2] = table[temp1.m128i_i32[2]]; temp1.m128i_u32[3] = table[temp1.m128i_i32[3]]; \ + temp2.m128i_u32[0] = table[temp2.m128i_i32[0]]; temp2.m128i_u32[1] = table[temp2.m128i_i32[1]]; temp2.m128i_u32[2] = table[temp2.m128i_i32[2]]; temp2.m128i_u32[3] = table[temp2.m128i_i32[3]]; \ + temp3.m128i_u32[0] = table[temp3.m128i_i32[0]]; temp3.m128i_u32[1] = table[temp3.m128i_i32[1]]; temp3.m128i_u32[2] = table[temp3.m128i_i32[2]]; temp3.m128i_u32[3] = table[temp3.m128i_i32[3]]; \ + v0 = temp0.m128i_i128; \ + v1 = temp1.m128i_i128; \ + v2 = temp2.m128i_i128; \ + v3 = temp3.m128i_i128; \ +} + +static void STBIR__CODER_NAME( stbir__encode_uint8_srgb )( void * outputp, int width_times_channels, float const * encode ) +{ + unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp; + unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; + + #ifdef STBIR_SIMD + + if ( width_times_channels >= 16 ) + { + float const * end_encode_m16 = encode + width_times_channels - 16; + end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + stbir__simdf f0, f1, f2, f3; + stbir__simdi i0, i1, i2, i3; + STBIR_SIMD_NO_UNROLL(encode); + + stbir__simdf_load4_transposed( f0, f1, f2, f3, encode ); + + stbir__min_max_shift20( i0, f0 ); + stbir__min_max_shift20( i1, f1 ); + stbir__min_max_shift20( i2, f2 ); + stbir__min_max_shift20( i3, f3 ); + + stbir__simdi_table_lookup4( i0, i1, i2, i3, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); + + stbir__linear_to_srgb_finish( i0, f0 ); + stbir__linear_to_srgb_finish( i1, f1 ); + stbir__linear_to_srgb_finish( i2, f2 ); + stbir__linear_to_srgb_finish( i3, f3 ); + + stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) ); + + encode += 16; + output += 16; + if ( output <= end_output ) + continue; + if ( output == ( end_output + 16 ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m16; + } + return; + } + #endif + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( output <= end_output ) + { + STBIR_SIMD_NO_UNROLL(encode); + + output[0-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] ); + output[1-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] ); + output[2-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] ); + output[3-4] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order3] ); + + output += 4; + encode += 4; + } + output -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + STBIR_NO_UNROLL(encode); + output[0] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order0] ); + #if stbir__coder_min_num >= 2 + output[1] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order1] ); + #endif + #if stbir__coder_min_num >= 3 + output[2] = stbir__linear_to_srgb_uchar( encode[stbir__encode_order2] ); + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif +} + +#if ( stbir__coder_min_num == 4 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) ) + +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb4_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) +{ + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + unsigned char const * input = (unsigned char const *)inputp; + + do { + decode[0] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ]; + decode[1] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order1] ]; + decode[2] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order2] ]; + decode[3] = ( (float) input[stbir__decode_order3] ) * stbir__max_uint8_as_float_inverted; + input += 4; + decode += 4; + } while( decode < decode_end ); + return decode_end; +} + + +static void STBIR__CODER_NAME( stbir__encode_uint8_srgb4_linearalpha )( void * outputp, int width_times_channels, float const * encode ) +{ + unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp; + unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; + + #ifdef STBIR_SIMD + + if ( width_times_channels >= 16 ) + { + float const * end_encode_m16 = encode + width_times_channels - 16; + end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + stbir__simdf f0, f1, f2, f3; + stbir__simdi i0, i1, i2, i3; + + STBIR_SIMD_NO_UNROLL(encode); + stbir__simdf_load4_transposed( f0, f1, f2, f3, encode ); + + stbir__min_max_shift20( i0, f0 ); + stbir__min_max_shift20( i1, f1 ); + stbir__min_max_shift20( i2, f2 ); + stbir__scale_and_convert( i3, f3 ); + + stbir__simdi_table_lookup3( i0, i1, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); + + stbir__linear_to_srgb_finish( i0, f0 ); + stbir__linear_to_srgb_finish( i1, f1 ); + stbir__linear_to_srgb_finish( i2, f2 ); + + stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) ); + + output += 16; + encode += 16; + + if ( output <= end_output ) + continue; + if ( output == ( end_output + 16 ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m16; + } + return; + } + #endif + + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float f; + STBIR_SIMD_NO_UNROLL(encode); + + output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] ); + output[stbir__decode_order1] = stbir__linear_to_srgb_uchar( encode[1] ); + output[stbir__decode_order2] = stbir__linear_to_srgb_uchar( encode[2] ); + + f = encode[3] * stbir__max_uint8_as_float + 0.5f; + STBIR_CLAMP(f, 0, 255); + output[stbir__decode_order3] = (unsigned char) f; + + output += 4; + encode += 4; + } while( output < end_output ); +} + +#endif + +#if ( stbir__coder_min_num == 2 ) || ( ( stbir__coder_min_num == 1 ) && ( !defined(stbir__decode_swizzle) ) ) + +static float * STBIR__CODER_NAME(stbir__decode_uint8_srgb2_linearalpha)( float * decodep, int width_times_channels, void const * inputp ) +{ + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + unsigned char const * input = (unsigned char const *)inputp; + + decode += 4; + while( decode <= decode_end ) + { + decode[0-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0] ]; + decode[1-4] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted; + decode[2-4] = stbir__srgb_uchar_to_linear_float[ input[stbir__decode_order0+2] ]; + decode[3-4] = ( (float) input[stbir__decode_order1+2] ) * stbir__max_uint8_as_float_inverted; + input += 4; + decode += 4; + } + decode -= 4; + if( decode < decode_end ) + { + decode[0] = stbir__srgb_uchar_to_linear_float[ stbir__decode_order0 ]; + decode[1] = ( (float) input[stbir__decode_order1] ) * stbir__max_uint8_as_float_inverted; + } + return decode_end; +} + +static void STBIR__CODER_NAME( stbir__encode_uint8_srgb2_linearalpha )( void * outputp, int width_times_channels, float const * encode ) +{ + unsigned char STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned char*) outputp; + unsigned char * end_output = ( (unsigned char*) output ) + width_times_channels; + + #ifdef STBIR_SIMD + + if ( width_times_channels >= 16 ) + { + float const * end_encode_m16 = encode + width_times_channels - 16; + end_output -= 16; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + stbir__simdf f0, f1, f2, f3; + stbir__simdi i0, i1, i2, i3; + + STBIR_SIMD_NO_UNROLL(encode); + stbir__simdf_load4_transposed( f0, f1, f2, f3, encode ); + + stbir__min_max_shift20( i0, f0 ); + stbir__scale_and_convert( i1, f1 ); + stbir__min_max_shift20( i2, f2 ); + stbir__scale_and_convert( i3, f3 ); + + stbir__simdi_table_lookup2( i0, i2, ( fp32_to_srgb8_tab4 - (127-13)*8 ) ); + + stbir__linear_to_srgb_finish( i0, f0 ); + stbir__linear_to_srgb_finish( i2, f2 ); + + stbir__interleave_pack_and_store_16_u8( output, STBIR_strs_join1(i, ,stbir__encode_order0), STBIR_strs_join1(i, ,stbir__encode_order1), STBIR_strs_join1(i, ,stbir__encode_order2), STBIR_strs_join1(i, ,stbir__encode_order3) ); + + output += 16; + encode += 16; + if ( output <= end_output ) + continue; + if ( output == ( end_output + 16 ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m16; + } + return; + } + #endif + + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float f; + STBIR_SIMD_NO_UNROLL(encode); + + output[stbir__decode_order0] = stbir__linear_to_srgb_uchar( encode[0] ); + + f = encode[1] * stbir__max_uint8_as_float + 0.5f; + STBIR_CLAMP(f, 0, 255); + output[stbir__decode_order1] = (unsigned char) f; + + output += 2; + encode += 2; + } while( output < end_output ); +} + +#endif + +static float * STBIR__CODER_NAME(stbir__decode_uint16_linear_scaled)( float * decodep, int width_times_channels, void const * inputp ) +{ + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + unsigned short const * input = (unsigned short const *)inputp; + + #ifdef STBIR_SIMD + unsigned short const * end_input_m8 = input + width_times_channels - 8; + if ( width_times_channels >= 8 ) + { + decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + #ifdef STBIR_SIMD8 + stbir__simdi i; stbir__simdi8 o; + stbir__simdf8 of; + STBIR_NO_UNROLL(decode); + stbir__simdi_load( i, input ); + stbir__simdi8_expand_u16_to_u32( o, i ); + stbir__simdi8_convert_i32_to_float( of, o ); + stbir__simdf8_mult( of, of, STBIR_max_uint16_as_float_inverted8); + stbir__decode_simdf8_flip( of ); + stbir__simdf8_store( decode + 0, of ); + #else + stbir__simdi i, o0, o1; + stbir__simdf of0, of1; + STBIR_NO_UNROLL(decode); + stbir__simdi_load( i, input ); + stbir__simdi_expand_u16_to_u32( o0,o1,i ); + stbir__simdi_convert_i32_to_float( of0, o0 ); + stbir__simdi_convert_i32_to_float( of1, o1 ); + stbir__simdf_mult( of0, of0, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted) ); + stbir__simdf_mult( of1, of1, STBIR__CONSTF(STBIR_max_uint16_as_float_inverted)); + stbir__decode_simdf4_flip( of0 ); + stbir__decode_simdf4_flip( of1 ); + stbir__simdf_store( decode + 0, of0 ); + stbir__simdf_store( decode + 4, of1 ); + #endif + decode += 8; + input += 8; + if ( decode <= decode_end ) + continue; + if ( decode == ( decode_end + 8 ) ) + break; + decode = decode_end; // backup and do last couple + input = end_input_m8; + } + return decode_end + 8; + } + #endif + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( decode <= decode_end ) + { + STBIR_SIMD_NO_UNROLL(decode); + decode[0-4] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted; + decode[1-4] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted; + decode[2-4] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted; + decode[3-4] = ((float)(input[stbir__decode_order3])) * stbir__max_uint16_as_float_inverted; + decode += 4; + input += 4; + } + decode -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( decode < decode_end ) + { + STBIR_NO_UNROLL(decode); + decode[0] = ((float)(input[stbir__decode_order0])) * stbir__max_uint16_as_float_inverted; + #if stbir__coder_min_num >= 2 + decode[1] = ((float)(input[stbir__decode_order1])) * stbir__max_uint16_as_float_inverted; + #endif + #if stbir__coder_min_num >= 3 + decode[2] = ((float)(input[stbir__decode_order2])) * stbir__max_uint16_as_float_inverted; + #endif + decode += stbir__coder_min_num; + input += stbir__coder_min_num; + } + #endif + return decode_end; +} + + +static void STBIR__CODER_NAME(stbir__encode_uint16_linear_scaled)( void * outputp, int width_times_channels, float const * encode ) +{ + unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp; + unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels; + + #ifdef STBIR_SIMD + { + if ( width_times_channels >= stbir__simdfX_float_count*2 ) + { + float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; + end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + stbir__simdfX e0, e1; + stbir__simdiX i; + STBIR_SIMD_NO_UNROLL(encode); + stbir__simdfX_madd_mem( e0, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode ); + stbir__simdfX_madd_mem( e1, STBIR_simd_point5X, STBIR_max_uint16_as_floatX, encode+stbir__simdfX_float_count ); + stbir__encode_simdfX_unflip( e0 ); + stbir__encode_simdfX_unflip( e1 ); + stbir__simdfX_pack_to_words( i, e0, e1 ); + stbir__simdiX_store( output, i ); + encode += stbir__simdfX_float_count*2; + output += stbir__simdfX_float_count*2; + if ( output <= end_output ) + continue; + if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m8; + } + return; + } + } + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + stbir__simdf e; + stbir__simdi i; + STBIR_NO_UNROLL(encode); + stbir__simdf_load( e, encode ); + stbir__simdf_madd( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), e ); + stbir__encode_simdf4_unflip( e ); + stbir__simdf_pack_to_8words( i, e, e ); // only use first 4 + stbir__simdi_store2( output-4, i ); + output += 4; + encode += 4; + } + output -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + stbir__simdf e; + STBIR_NO_UNROLL(encode); + stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order0 ); output[0] = stbir__simdf_convert_float_to_short( e ); + #if stbir__coder_min_num >= 2 + stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order1 ); output[1] = stbir__simdf_convert_float_to_short( e ); + #endif + #if stbir__coder_min_num >= 3 + stbir__simdf_madd1_mem( e, STBIR__CONSTF(STBIR_simd_point5), STBIR__CONSTF(STBIR_max_uint16_as_float), encode+stbir__encode_order2 ); output[2] = stbir__simdf_convert_float_to_short( e ); + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif + + #else + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + float f; + STBIR_SIMD_NO_UNROLL(encode); + f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f; + f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f; + f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f; + f = encode[stbir__encode_order3] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f; + output += 4; + encode += 4; + } + output -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + float f; + STBIR_NO_UNROLL(encode); + f = encode[stbir__encode_order0] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f; + #if stbir__coder_min_num >= 2 + f = encode[stbir__encode_order1] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f; + #endif + #if stbir__coder_min_num >= 3 + f = encode[stbir__encode_order2] * stbir__max_uint16_as_float + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f; + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif + #endif +} + +static float * STBIR__CODER_NAME(stbir__decode_uint16_linear)( float * decodep, int width_times_channels, void const * inputp ) +{ + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + unsigned short const * input = (unsigned short const *)inputp; + + #ifdef STBIR_SIMD + unsigned short const * end_input_m8 = input + width_times_channels - 8; + if ( width_times_channels >= 8 ) + { + decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + #ifdef STBIR_SIMD8 + stbir__simdi i; stbir__simdi8 o; + stbir__simdf8 of; + STBIR_NO_UNROLL(decode); + stbir__simdi_load( i, input ); + stbir__simdi8_expand_u16_to_u32( o, i ); + stbir__simdi8_convert_i32_to_float( of, o ); + stbir__decode_simdf8_flip( of ); + stbir__simdf8_store( decode + 0, of ); + #else + stbir__simdi i, o0, o1; + stbir__simdf of0, of1; + STBIR_NO_UNROLL(decode); + stbir__simdi_load( i, input ); + stbir__simdi_expand_u16_to_u32( o0, o1, i ); + stbir__simdi_convert_i32_to_float( of0, o0 ); + stbir__simdi_convert_i32_to_float( of1, o1 ); + stbir__decode_simdf4_flip( of0 ); + stbir__decode_simdf4_flip( of1 ); + stbir__simdf_store( decode + 0, of0 ); + stbir__simdf_store( decode + 4, of1 ); + #endif + decode += 8; + input += 8; + if ( decode <= decode_end ) + continue; + if ( decode == ( decode_end + 8 ) ) + break; + decode = decode_end; // backup and do last couple + input = end_input_m8; + } + return decode_end + 8; + } + #endif + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( decode <= decode_end ) + { + STBIR_SIMD_NO_UNROLL(decode); + decode[0-4] = ((float)(input[stbir__decode_order0])); + decode[1-4] = ((float)(input[stbir__decode_order1])); + decode[2-4] = ((float)(input[stbir__decode_order2])); + decode[3-4] = ((float)(input[stbir__decode_order3])); + decode += 4; + input += 4; + } + decode -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( decode < decode_end ) + { + STBIR_NO_UNROLL(decode); + decode[0] = ((float)(input[stbir__decode_order0])); + #if stbir__coder_min_num >= 2 + decode[1] = ((float)(input[stbir__decode_order1])); + #endif + #if stbir__coder_min_num >= 3 + decode[2] = ((float)(input[stbir__decode_order2])); + #endif + decode += stbir__coder_min_num; + input += stbir__coder_min_num; + } + #endif + return decode_end; +} + +static void STBIR__CODER_NAME(stbir__encode_uint16_linear)( void * outputp, int width_times_channels, float const * encode ) +{ + unsigned short STBIR_SIMD_STREAMOUT_PTR( * ) output = (unsigned short*) outputp; + unsigned short * end_output = ( (unsigned short*) output ) + width_times_channels; + + #ifdef STBIR_SIMD + { + if ( width_times_channels >= stbir__simdfX_float_count*2 ) + { + float const * end_encode_m8 = encode + width_times_channels - stbir__simdfX_float_count*2; + end_output -= stbir__simdfX_float_count*2; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + stbir__simdfX e0, e1; + stbir__simdiX i; + STBIR_SIMD_NO_UNROLL(encode); + stbir__simdfX_add_mem( e0, STBIR_simd_point5X, encode ); + stbir__simdfX_add_mem( e1, STBIR_simd_point5X, encode+stbir__simdfX_float_count ); + stbir__encode_simdfX_unflip( e0 ); + stbir__encode_simdfX_unflip( e1 ); + stbir__simdfX_pack_to_words( i, e0, e1 ); + stbir__simdiX_store( output, i ); + encode += stbir__simdfX_float_count*2; + output += stbir__simdfX_float_count*2; + if ( output <= end_output ) + continue; + if ( output == ( end_output + stbir__simdfX_float_count*2 ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m8; + } + return; + } + } + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + stbir__simdf e; + stbir__simdi i; + STBIR_NO_UNROLL(encode); + stbir__simdf_load( e, encode ); + stbir__simdf_add( e, STBIR__CONSTF(STBIR_simd_point5), e ); + stbir__encode_simdf4_unflip( e ); + stbir__simdf_pack_to_8words( i, e, e ); // only use first 4 + stbir__simdi_store2( output-4, i ); + output += 4; + encode += 4; + } + output -= 4; + #endif + + #else + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + float f; + STBIR_SIMD_NO_UNROLL(encode); + f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0-4] = (unsigned short)f; + f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1-4] = (unsigned short)f; + f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2-4] = (unsigned short)f; + f = encode[stbir__encode_order3] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[3-4] = (unsigned short)f; + output += 4; + encode += 4; + } + output -= 4; + #endif + + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + float f; + STBIR_NO_UNROLL(encode); + f = encode[stbir__encode_order0] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[0] = (unsigned short)f; + #if stbir__coder_min_num >= 2 + f = encode[stbir__encode_order1] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[1] = (unsigned short)f; + #endif + #if stbir__coder_min_num >= 3 + f = encode[stbir__encode_order2] + 0.5f; STBIR_CLAMP(f, 0, 65535); output[2] = (unsigned short)f; + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif +} + +static float * STBIR__CODER_NAME(stbir__decode_half_float_linear)( float * decodep, int width_times_channels, void const * inputp ) +{ + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + stbir__FP16 const * input = (stbir__FP16 const *)inputp; + + #ifdef STBIR_SIMD + if ( width_times_channels >= 8 ) + { + stbir__FP16 const * end_input_m8 = input + width_times_channels - 8; + decode_end -= 8; + STBIR_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + STBIR_NO_UNROLL(decode); + + stbir__half_to_float_SIMD( decode, input ); + #ifdef stbir__decode_swizzle + #ifdef STBIR_SIMD8 + { + stbir__simdf8 of; + stbir__simdf8_load( of, decode ); + stbir__decode_simdf8_flip( of ); + stbir__simdf8_store( decode, of ); + } + #else + { + stbir__simdf of0,of1; + stbir__simdf_load( of0, decode ); + stbir__simdf_load( of1, decode+4 ); + stbir__decode_simdf4_flip( of0 ); + stbir__decode_simdf4_flip( of1 ); + stbir__simdf_store( decode, of0 ); + stbir__simdf_store( decode+4, of1 ); + } + #endif + #endif + decode += 8; + input += 8; + if ( decode <= decode_end ) + continue; + if ( decode == ( decode_end + 8 ) ) + break; + decode = decode_end; // backup and do last couple + input = end_input_m8; + } + return decode_end + 8; + } + #endif + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( decode <= decode_end ) + { + STBIR_SIMD_NO_UNROLL(decode); + decode[0-4] = stbir__half_to_float(input[stbir__decode_order0]); + decode[1-4] = stbir__half_to_float(input[stbir__decode_order1]); + decode[2-4] = stbir__half_to_float(input[stbir__decode_order2]); + decode[3-4] = stbir__half_to_float(input[stbir__decode_order3]); + decode += 4; + input += 4; + } + decode -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( decode < decode_end ) + { + STBIR_NO_UNROLL(decode); + decode[0] = stbir__half_to_float(input[stbir__decode_order0]); + #if stbir__coder_min_num >= 2 + decode[1] = stbir__half_to_float(input[stbir__decode_order1]); + #endif + #if stbir__coder_min_num >= 3 + decode[2] = stbir__half_to_float(input[stbir__decode_order2]); + #endif + decode += stbir__coder_min_num; + input += stbir__coder_min_num; + } + #endif + return decode_end; +} + +static void STBIR__CODER_NAME( stbir__encode_half_float_linear )( void * outputp, int width_times_channels, float const * encode ) +{ + stbir__FP16 STBIR_SIMD_STREAMOUT_PTR( * ) output = (stbir__FP16*) outputp; + stbir__FP16 * end_output = ( (stbir__FP16*) output ) + width_times_channels; + + #ifdef STBIR_SIMD + if ( width_times_channels >= 8 ) + { + float const * end_encode_m8 = encode + width_times_channels - 8; + end_output -= 8; + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + STBIR_SIMD_NO_UNROLL(encode); + #ifdef stbir__decode_swizzle + #ifdef STBIR_SIMD8 + { + stbir__simdf8 of; + stbir__simdf8_load( of, encode ); + stbir__encode_simdf8_unflip( of ); + stbir__float_to_half_SIMD( output, (float*)&of ); + } + #else + { + stbir__simdf of[2]; + stbir__simdf_load( of[0], encode ); + stbir__simdf_load( of[1], encode+4 ); + stbir__encode_simdf4_unflip( of[0] ); + stbir__encode_simdf4_unflip( of[1] ); + stbir__float_to_half_SIMD( output, (float*)of ); + } + #endif + #else + stbir__float_to_half_SIMD( output, encode ); + #endif + encode += 8; + output += 8; + if ( output <= end_output ) + continue; + if ( output == ( end_output + 8 ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m8; + } + return; + } + #endif + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + STBIR_SIMD_NO_UNROLL(output); + output[0-4] = stbir__float_to_half(encode[stbir__encode_order0]); + output[1-4] = stbir__float_to_half(encode[stbir__encode_order1]); + output[2-4] = stbir__float_to_half(encode[stbir__encode_order2]); + output[3-4] = stbir__float_to_half(encode[stbir__encode_order3]); + output += 4; + encode += 4; + } + output -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + STBIR_NO_UNROLL(output); + output[0] = stbir__float_to_half(encode[stbir__encode_order0]); + #if stbir__coder_min_num >= 2 + output[1] = stbir__float_to_half(encode[stbir__encode_order1]); + #endif + #if stbir__coder_min_num >= 3 + output[2] = stbir__float_to_half(encode[stbir__encode_order2]); + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif +} + +static float * STBIR__CODER_NAME(stbir__decode_float_linear)( float * decodep, int width_times_channels, void const * inputp ) +{ + #ifdef stbir__decode_swizzle + float STBIR_STREAMOUT_PTR( * ) decode = decodep; + float * decode_end = (float*) decode + width_times_channels; + float const * input = (float const *)inputp; + + #ifdef STBIR_SIMD + if ( width_times_channels >= 16 ) + { + float const * end_input_m16 = input + width_times_channels - 16; + decode_end -= 16; + STBIR_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + STBIR_NO_UNROLL(decode); + #ifdef stbir__decode_swizzle + #ifdef STBIR_SIMD8 + { + stbir__simdf8 of0,of1; + stbir__simdf8_load( of0, input ); + stbir__simdf8_load( of1, input+8 ); + stbir__decode_simdf8_flip( of0 ); + stbir__decode_simdf8_flip( of1 ); + stbir__simdf8_store( decode, of0 ); + stbir__simdf8_store( decode+8, of1 ); + } + #else + { + stbir__simdf of0,of1,of2,of3; + stbir__simdf_load( of0, input ); + stbir__simdf_load( of1, input+4 ); + stbir__simdf_load( of2, input+8 ); + stbir__simdf_load( of3, input+12 ); + stbir__decode_simdf4_flip( of0 ); + stbir__decode_simdf4_flip( of1 ); + stbir__decode_simdf4_flip( of2 ); + stbir__decode_simdf4_flip( of3 ); + stbir__simdf_store( decode, of0 ); + stbir__simdf_store( decode+4, of1 ); + stbir__simdf_store( decode+8, of2 ); + stbir__simdf_store( decode+12, of3 ); + } + #endif + #endif + decode += 16; + input += 16; + if ( decode <= decode_end ) + continue; + if ( decode == ( decode_end + 16 ) ) + break; + decode = decode_end; // backup and do last couple + input = end_input_m16; + } + return decode_end + 16; + } + #endif + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + decode += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( decode <= decode_end ) + { + STBIR_SIMD_NO_UNROLL(decode); + decode[0-4] = input[stbir__decode_order0]; + decode[1-4] = input[stbir__decode_order1]; + decode[2-4] = input[stbir__decode_order2]; + decode[3-4] = input[stbir__decode_order3]; + decode += 4; + input += 4; + } + decode -= 4; + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( decode < decode_end ) + { + STBIR_NO_UNROLL(decode); + decode[0] = input[stbir__decode_order0]; + #if stbir__coder_min_num >= 2 + decode[1] = input[stbir__decode_order1]; + #endif + #if stbir__coder_min_num >= 3 + decode[2] = input[stbir__decode_order2]; + #endif + decode += stbir__coder_min_num; + input += stbir__coder_min_num; + } + #endif + return decode_end; + + #else + + if ( (void*)decodep != inputp ) + STBIR_MEMCPY( decodep, inputp, width_times_channels * sizeof( float ) ); + + return decodep + width_times_channels; + + #endif +} + +static void STBIR__CODER_NAME( stbir__encode_float_linear )( void * outputp, int width_times_channels, float const * encode ) +{ + #if !defined( STBIR_FLOAT_HIGH_CLAMP ) && !defined(STBIR_FLOAT_LO_CLAMP) && !defined(stbir__decode_swizzle) + + if ( (void*)outputp != (void*) encode ) + STBIR_MEMCPY( outputp, encode, width_times_channels * sizeof( float ) ); + + #else + + float STBIR_SIMD_STREAMOUT_PTR( * ) output = (float*) outputp; + float * end_output = ( (float*) output ) + width_times_channels; + + #ifdef STBIR_FLOAT_HIGH_CLAMP + #define stbir_scalar_hi_clamp( v ) if ( v > STBIR_FLOAT_HIGH_CLAMP ) v = STBIR_FLOAT_HIGH_CLAMP; + #else + #define stbir_scalar_hi_clamp( v ) + #endif + #ifdef STBIR_FLOAT_LOW_CLAMP + #define stbir_scalar_lo_clamp( v ) if ( v < STBIR_FLOAT_LOW_CLAMP ) v = STBIR_FLOAT_LOW_CLAMP; + #else + #define stbir_scalar_lo_clamp( v ) + #endif + + #ifdef STBIR_SIMD + + #ifdef STBIR_FLOAT_HIGH_CLAMP + const stbir__simdfX high_clamp = stbir__simdf_frepX(STBIR_FLOAT_HIGH_CLAMP); + #endif + #ifdef STBIR_FLOAT_LOW_CLAMP + const stbir__simdfX low_clamp = stbir__simdf_frepX(STBIR_FLOAT_LOW_CLAMP); + #endif + + if ( width_times_channels >= ( stbir__simdfX_float_count * 2 ) ) + { + float const * end_encode_m8 = encode + width_times_channels - ( stbir__simdfX_float_count * 2 ); + end_output -= ( stbir__simdfX_float_count * 2 ); + STBIR_SIMD_NO_UNROLL_LOOP_START_INF_FOR + for(;;) + { + stbir__simdfX e0, e1; + STBIR_SIMD_NO_UNROLL(encode); + stbir__simdfX_load( e0, encode ); + stbir__simdfX_load( e1, encode+stbir__simdfX_float_count ); +#ifdef STBIR_FLOAT_HIGH_CLAMP + stbir__simdfX_min( e0, e0, high_clamp ); + stbir__simdfX_min( e1, e1, high_clamp ); +#endif +#ifdef STBIR_FLOAT_LOW_CLAMP + stbir__simdfX_max( e0, e0, low_clamp ); + stbir__simdfX_max( e1, e1, low_clamp ); +#endif + stbir__encode_simdfX_unflip( e0 ); + stbir__encode_simdfX_unflip( e1 ); + stbir__simdfX_store( output, e0 ); + stbir__simdfX_store( output+stbir__simdfX_float_count, e1 ); + encode += stbir__simdfX_float_count * 2; + output += stbir__simdfX_float_count * 2; + if ( output < end_output ) + continue; + if ( output == ( end_output + ( stbir__simdfX_float_count * 2 ) ) ) + break; + output = end_output; // backup and do last couple + encode = end_encode_m8; + } + return; + } + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + stbir__simdf e0; + STBIR_NO_UNROLL(encode); + stbir__simdf_load( e0, encode ); +#ifdef STBIR_FLOAT_HIGH_CLAMP + stbir__simdf_min( e0, e0, high_clamp ); +#endif +#ifdef STBIR_FLOAT_LOW_CLAMP + stbir__simdf_max( e0, e0, low_clamp ); +#endif + stbir__encode_simdf4_unflip( e0 ); + stbir__simdf_store( output-4, e0 ); + output += 4; + encode += 4; + } + output -= 4; + #endif + + #else + + // try to do blocks of 4 when you can + #if stbir__coder_min_num != 3 // doesn't divide cleanly by four + output += 4; + STBIR_SIMD_NO_UNROLL_LOOP_START + while( output <= end_output ) + { + float e; + STBIR_SIMD_NO_UNROLL(encode); + e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0-4] = e; + e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1-4] = e; + e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2-4] = e; + e = encode[ stbir__encode_order3 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[3-4] = e; + output += 4; + encode += 4; + } + output -= 4; + + #endif + + #endif + + // do the remnants + #if stbir__coder_min_num < 4 + STBIR_NO_UNROLL_LOOP_START + while( output < end_output ) + { + float e; + STBIR_NO_UNROLL(encode); + e = encode[ stbir__encode_order0 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[0] = e; + #if stbir__coder_min_num >= 2 + e = encode[ stbir__encode_order1 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[1] = e; + #endif + #if stbir__coder_min_num >= 3 + e = encode[ stbir__encode_order2 ]; stbir_scalar_hi_clamp( e ); stbir_scalar_lo_clamp( e ); output[2] = e; + #endif + output += stbir__coder_min_num; + encode += stbir__coder_min_num; + } + #endif + + #endif +} + +#undef stbir__decode_suffix +#undef stbir__decode_simdf8_flip +#undef stbir__decode_simdf4_flip +#undef stbir__decode_order0 +#undef stbir__decode_order1 +#undef stbir__decode_order2 +#undef stbir__decode_order3 +#undef stbir__encode_order0 +#undef stbir__encode_order1 +#undef stbir__encode_order2 +#undef stbir__encode_order3 +#undef stbir__encode_simdf8_unflip +#undef stbir__encode_simdf4_unflip +#undef stbir__encode_simdfX_unflip +#undef STBIR__CODER_NAME +#undef stbir__coder_min_num +#undef stbir__decode_swizzle +#undef stbir_scalar_hi_clamp +#undef stbir_scalar_lo_clamp +#undef STB_IMAGE_RESIZE_DO_CODERS + +#elif defined( STB_IMAGE_RESIZE_DO_VERTICALS) + +#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#define STBIR_chans( start, end ) STBIR_strs_join14(start,STBIR__vertical_channels,end,_cont) +#else +#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__vertical_channels,end) +#endif + +#if STBIR__vertical_channels >= 1 +#define stbIF0( code ) code +#else +#define stbIF0( code ) +#endif +#if STBIR__vertical_channels >= 2 +#define stbIF1( code ) code +#else +#define stbIF1( code ) +#endif +#if STBIR__vertical_channels >= 3 +#define stbIF2( code ) code +#else +#define stbIF2( code ) +#endif +#if STBIR__vertical_channels >= 4 +#define stbIF3( code ) code +#else +#define stbIF3( code ) +#endif +#if STBIR__vertical_channels >= 5 +#define stbIF4( code ) code +#else +#define stbIF4( code ) +#endif +#if STBIR__vertical_channels >= 6 +#define stbIF5( code ) code +#else +#define stbIF5( code ) +#endif +#if STBIR__vertical_channels >= 7 +#define stbIF6( code ) code +#else +#define stbIF6( code ) +#endif +#if STBIR__vertical_channels >= 8 +#define stbIF7( code ) code +#else +#define stbIF7( code ) +#endif + +static void STBIR_chans( stbir__vertical_scatter_with_,_coeffs)( float ** outputs, float const * vertical_coefficients, float const * input, float const * input_end ) +{ + stbIF0( float STBIR_SIMD_STREAMOUT_PTR( * ) output0 = outputs[0]; float c0s = vertical_coefficients[0]; ) + stbIF1( float STBIR_SIMD_STREAMOUT_PTR( * ) output1 = outputs[1]; float c1s = vertical_coefficients[1]; ) + stbIF2( float STBIR_SIMD_STREAMOUT_PTR( * ) output2 = outputs[2]; float c2s = vertical_coefficients[2]; ) + stbIF3( float STBIR_SIMD_STREAMOUT_PTR( * ) output3 = outputs[3]; float c3s = vertical_coefficients[3]; ) + stbIF4( float STBIR_SIMD_STREAMOUT_PTR( * ) output4 = outputs[4]; float c4s = vertical_coefficients[4]; ) + stbIF5( float STBIR_SIMD_STREAMOUT_PTR( * ) output5 = outputs[5]; float c5s = vertical_coefficients[5]; ) + stbIF6( float STBIR_SIMD_STREAMOUT_PTR( * ) output6 = outputs[6]; float c6s = vertical_coefficients[6]; ) + stbIF7( float STBIR_SIMD_STREAMOUT_PTR( * ) output7 = outputs[7]; float c7s = vertical_coefficients[7]; ) + + #ifdef STBIR_SIMD + { + stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); ) + stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); ) + stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); ) + stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); ) + stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); ) + stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); ) + stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); ) + stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); ) + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( ( (char*)input_end - (char*) input ) >= (16*stbir__simdfX_float_count) ) + { + stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3; + STBIR_SIMD_NO_UNROLL(output0); + + stbir__simdfX_load( r0, input ); stbir__simdfX_load( r1, input+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input+(3*stbir__simdfX_float_count) ); + + #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE + stbIF0( stbir__simdfX_load( o0, output0 ); stbir__simdfX_load( o1, output0+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output0+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c0 ); stbir__simdfX_madd( o1, o1, r1, c0 ); stbir__simdfX_madd( o2, o2, r2, c0 ); stbir__simdfX_madd( o3, o3, r3, c0 ); + stbir__simdfX_store( output0, o0 ); stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); ) + stbIF1( stbir__simdfX_load( o0, output1 ); stbir__simdfX_load( o1, output1+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output1+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output1+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c1 ); stbir__simdfX_madd( o1, o1, r1, c1 ); stbir__simdfX_madd( o2, o2, r2, c1 ); stbir__simdfX_madd( o3, o3, r3, c1 ); + stbir__simdfX_store( output1, o0 ); stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); ) + stbIF2( stbir__simdfX_load( o0, output2 ); stbir__simdfX_load( o1, output2+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output2+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output2+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c2 ); stbir__simdfX_madd( o1, o1, r1, c2 ); stbir__simdfX_madd( o2, o2, r2, c2 ); stbir__simdfX_madd( o3, o3, r3, c2 ); + stbir__simdfX_store( output2, o0 ); stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); ) + stbIF3( stbir__simdfX_load( o0, output3 ); stbir__simdfX_load( o1, output3+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output3+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output3+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c3 ); stbir__simdfX_madd( o1, o1, r1, c3 ); stbir__simdfX_madd( o2, o2, r2, c3 ); stbir__simdfX_madd( o3, o3, r3, c3 ); + stbir__simdfX_store( output3, o0 ); stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); ) + stbIF4( stbir__simdfX_load( o0, output4 ); stbir__simdfX_load( o1, output4+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output4+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output4+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c4 ); stbir__simdfX_madd( o1, o1, r1, c4 ); stbir__simdfX_madd( o2, o2, r2, c4 ); stbir__simdfX_madd( o3, o3, r3, c4 ); + stbir__simdfX_store( output4, o0 ); stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); ) + stbIF5( stbir__simdfX_load( o0, output5 ); stbir__simdfX_load( o1, output5+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output5+(2*stbir__simdfX_float_count)); stbir__simdfX_load( o3, output5+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c5 ); stbir__simdfX_madd( o1, o1, r1, c5 ); stbir__simdfX_madd( o2, o2, r2, c5 ); stbir__simdfX_madd( o3, o3, r3, c5 ); + stbir__simdfX_store( output5, o0 ); stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); ) + stbIF6( stbir__simdfX_load( o0, output6 ); stbir__simdfX_load( o1, output6+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output6+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output6+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c6 ); stbir__simdfX_madd( o1, o1, r1, c6 ); stbir__simdfX_madd( o2, o2, r2, c6 ); stbir__simdfX_madd( o3, o3, r3, c6 ); + stbir__simdfX_store( output6, o0 ); stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); ) + stbIF7( stbir__simdfX_load( o0, output7 ); stbir__simdfX_load( o1, output7+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output7+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output7+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c7 ); stbir__simdfX_madd( o1, o1, r1, c7 ); stbir__simdfX_madd( o2, o2, r2, c7 ); stbir__simdfX_madd( o3, o3, r3, c7 ); + stbir__simdfX_store( output7, o0 ); stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); ) + #else + stbIF0( stbir__simdfX_mult( o0, r0, c0 ); stbir__simdfX_mult( o1, r1, c0 ); stbir__simdfX_mult( o2, r2, c0 ); stbir__simdfX_mult( o3, r3, c0 ); + stbir__simdfX_store( output0, o0 ); stbir__simdfX_store( output0+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output0+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output0+(3*stbir__simdfX_float_count), o3 ); ) + stbIF1( stbir__simdfX_mult( o0, r0, c1 ); stbir__simdfX_mult( o1, r1, c1 ); stbir__simdfX_mult( o2, r2, c1 ); stbir__simdfX_mult( o3, r3, c1 ); + stbir__simdfX_store( output1, o0 ); stbir__simdfX_store( output1+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output1+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output1+(3*stbir__simdfX_float_count), o3 ); ) + stbIF2( stbir__simdfX_mult( o0, r0, c2 ); stbir__simdfX_mult( o1, r1, c2 ); stbir__simdfX_mult( o2, r2, c2 ); stbir__simdfX_mult( o3, r3, c2 ); + stbir__simdfX_store( output2, o0 ); stbir__simdfX_store( output2+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output2+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output2+(3*stbir__simdfX_float_count), o3 ); ) + stbIF3( stbir__simdfX_mult( o0, r0, c3 ); stbir__simdfX_mult( o1, r1, c3 ); stbir__simdfX_mult( o2, r2, c3 ); stbir__simdfX_mult( o3, r3, c3 ); + stbir__simdfX_store( output3, o0 ); stbir__simdfX_store( output3+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output3+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output3+(3*stbir__simdfX_float_count), o3 ); ) + stbIF4( stbir__simdfX_mult( o0, r0, c4 ); stbir__simdfX_mult( o1, r1, c4 ); stbir__simdfX_mult( o2, r2, c4 ); stbir__simdfX_mult( o3, r3, c4 ); + stbir__simdfX_store( output4, o0 ); stbir__simdfX_store( output4+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output4+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output4+(3*stbir__simdfX_float_count), o3 ); ) + stbIF5( stbir__simdfX_mult( o0, r0, c5 ); stbir__simdfX_mult( o1, r1, c5 ); stbir__simdfX_mult( o2, r2, c5 ); stbir__simdfX_mult( o3, r3, c5 ); + stbir__simdfX_store( output5, o0 ); stbir__simdfX_store( output5+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output5+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output5+(3*stbir__simdfX_float_count), o3 ); ) + stbIF6( stbir__simdfX_mult( o0, r0, c6 ); stbir__simdfX_mult( o1, r1, c6 ); stbir__simdfX_mult( o2, r2, c6 ); stbir__simdfX_mult( o3, r3, c6 ); + stbir__simdfX_store( output6, o0 ); stbir__simdfX_store( output6+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output6+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output6+(3*stbir__simdfX_float_count), o3 ); ) + stbIF7( stbir__simdfX_mult( o0, r0, c7 ); stbir__simdfX_mult( o1, r1, c7 ); stbir__simdfX_mult( o2, r2, c7 ); stbir__simdfX_mult( o3, r3, c7 ); + stbir__simdfX_store( output7, o0 ); stbir__simdfX_store( output7+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output7+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output7+(3*stbir__simdfX_float_count), o3 ); ) + #endif + + input += (4*stbir__simdfX_float_count); + stbIF0( output0 += (4*stbir__simdfX_float_count); ) stbIF1( output1 += (4*stbir__simdfX_float_count); ) stbIF2( output2 += (4*stbir__simdfX_float_count); ) stbIF3( output3 += (4*stbir__simdfX_float_count); ) stbIF4( output4 += (4*stbir__simdfX_float_count); ) stbIF5( output5 += (4*stbir__simdfX_float_count); ) stbIF6( output6 += (4*stbir__simdfX_float_count); ) stbIF7( output7 += (4*stbir__simdfX_float_count); ) + } + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( ( (char*)input_end - (char*) input ) >= 16 ) + { + stbir__simdf o0, r0; + STBIR_SIMD_NO_UNROLL(output0); + + stbir__simdf_load( r0, input ); + + #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE + stbIF0( stbir__simdf_load( o0, output0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); stbir__simdf_store( output0, o0 ); ) + stbIF1( stbir__simdf_load( o0, output1 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); stbir__simdf_store( output1, o0 ); ) + stbIF2( stbir__simdf_load( o0, output2 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); stbir__simdf_store( output2, o0 ); ) + stbIF3( stbir__simdf_load( o0, output3 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); stbir__simdf_store( output3, o0 ); ) + stbIF4( stbir__simdf_load( o0, output4 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); stbir__simdf_store( output4, o0 ); ) + stbIF5( stbir__simdf_load( o0, output5 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); stbir__simdf_store( output5, o0 ); ) + stbIF6( stbir__simdf_load( o0, output6 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); stbir__simdf_store( output6, o0 ); ) + stbIF7( stbir__simdf_load( o0, output7 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); stbir__simdf_store( output7, o0 ); ) + #else + stbIF0( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); stbir__simdf_store( output0, o0 ); ) + stbIF1( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); stbir__simdf_store( output1, o0 ); ) + stbIF2( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); stbir__simdf_store( output2, o0 ); ) + stbIF3( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); stbir__simdf_store( output3, o0 ); ) + stbIF4( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); stbir__simdf_store( output4, o0 ); ) + stbIF5( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); stbir__simdf_store( output5, o0 ); ) + stbIF6( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); stbir__simdf_store( output6, o0 ); ) + stbIF7( stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); stbir__simdf_store( output7, o0 ); ) + #endif + + input += 4; + stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; ) + } + } + #else + STBIR_NO_UNROLL_LOOP_START + while ( ( (char*)input_end - (char*) input ) >= 16 ) + { + float r0, r1, r2, r3; + STBIR_NO_UNROLL(input); + + r0 = input[0], r1 = input[1], r2 = input[2], r3 = input[3]; + + #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE + stbIF0( output0[0] += ( r0 * c0s ); output0[1] += ( r1 * c0s ); output0[2] += ( r2 * c0s ); output0[3] += ( r3 * c0s ); ) + stbIF1( output1[0] += ( r0 * c1s ); output1[1] += ( r1 * c1s ); output1[2] += ( r2 * c1s ); output1[3] += ( r3 * c1s ); ) + stbIF2( output2[0] += ( r0 * c2s ); output2[1] += ( r1 * c2s ); output2[2] += ( r2 * c2s ); output2[3] += ( r3 * c2s ); ) + stbIF3( output3[0] += ( r0 * c3s ); output3[1] += ( r1 * c3s ); output3[2] += ( r2 * c3s ); output3[3] += ( r3 * c3s ); ) + stbIF4( output4[0] += ( r0 * c4s ); output4[1] += ( r1 * c4s ); output4[2] += ( r2 * c4s ); output4[3] += ( r3 * c4s ); ) + stbIF5( output5[0] += ( r0 * c5s ); output5[1] += ( r1 * c5s ); output5[2] += ( r2 * c5s ); output5[3] += ( r3 * c5s ); ) + stbIF6( output6[0] += ( r0 * c6s ); output6[1] += ( r1 * c6s ); output6[2] += ( r2 * c6s ); output6[3] += ( r3 * c6s ); ) + stbIF7( output7[0] += ( r0 * c7s ); output7[1] += ( r1 * c7s ); output7[2] += ( r2 * c7s ); output7[3] += ( r3 * c7s ); ) + #else + stbIF0( output0[0] = ( r0 * c0s ); output0[1] = ( r1 * c0s ); output0[2] = ( r2 * c0s ); output0[3] = ( r3 * c0s ); ) + stbIF1( output1[0] = ( r0 * c1s ); output1[1] = ( r1 * c1s ); output1[2] = ( r2 * c1s ); output1[3] = ( r3 * c1s ); ) + stbIF2( output2[0] = ( r0 * c2s ); output2[1] = ( r1 * c2s ); output2[2] = ( r2 * c2s ); output2[3] = ( r3 * c2s ); ) + stbIF3( output3[0] = ( r0 * c3s ); output3[1] = ( r1 * c3s ); output3[2] = ( r2 * c3s ); output3[3] = ( r3 * c3s ); ) + stbIF4( output4[0] = ( r0 * c4s ); output4[1] = ( r1 * c4s ); output4[2] = ( r2 * c4s ); output4[3] = ( r3 * c4s ); ) + stbIF5( output5[0] = ( r0 * c5s ); output5[1] = ( r1 * c5s ); output5[2] = ( r2 * c5s ); output5[3] = ( r3 * c5s ); ) + stbIF6( output6[0] = ( r0 * c6s ); output6[1] = ( r1 * c6s ); output6[2] = ( r2 * c6s ); output6[3] = ( r3 * c6s ); ) + stbIF7( output7[0] = ( r0 * c7s ); output7[1] = ( r1 * c7s ); output7[2] = ( r2 * c7s ); output7[3] = ( r3 * c7s ); ) + #endif + + input += 4; + stbIF0( output0 += 4; ) stbIF1( output1 += 4; ) stbIF2( output2 += 4; ) stbIF3( output3 += 4; ) stbIF4( output4 += 4; ) stbIF5( output5 += 4; ) stbIF6( output6 += 4; ) stbIF7( output7 += 4; ) + } + #endif + STBIR_NO_UNROLL_LOOP_START + while ( input < input_end ) + { + float r = input[0]; + STBIR_NO_UNROLL(output0); + + #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE + stbIF0( output0[0] += ( r * c0s ); ) + stbIF1( output1[0] += ( r * c1s ); ) + stbIF2( output2[0] += ( r * c2s ); ) + stbIF3( output3[0] += ( r * c3s ); ) + stbIF4( output4[0] += ( r * c4s ); ) + stbIF5( output5[0] += ( r * c5s ); ) + stbIF6( output6[0] += ( r * c6s ); ) + stbIF7( output7[0] += ( r * c7s ); ) + #else + stbIF0( output0[0] = ( r * c0s ); ) + stbIF1( output1[0] = ( r * c1s ); ) + stbIF2( output2[0] = ( r * c2s ); ) + stbIF3( output3[0] = ( r * c3s ); ) + stbIF4( output4[0] = ( r * c4s ); ) + stbIF5( output5[0] = ( r * c5s ); ) + stbIF6( output6[0] = ( r * c6s ); ) + stbIF7( output7[0] = ( r * c7s ); ) + #endif + + ++input; + stbIF0( ++output0; ) stbIF1( ++output1; ) stbIF2( ++output2; ) stbIF3( ++output3; ) stbIF4( ++output4; ) stbIF5( ++output5; ) stbIF6( ++output6; ) stbIF7( ++output7; ) + } +} + +static void STBIR_chans( stbir__vertical_gather_with_,_coeffs)( float * outputp, float const * vertical_coefficients, float const ** inputs, float const * input0_end ) +{ + float STBIR_SIMD_STREAMOUT_PTR( * ) output = outputp; + + stbIF0( float const * input0 = inputs[0]; float c0s = vertical_coefficients[0]; ) + stbIF1( float const * input1 = inputs[1]; float c1s = vertical_coefficients[1]; ) + stbIF2( float const * input2 = inputs[2]; float c2s = vertical_coefficients[2]; ) + stbIF3( float const * input3 = inputs[3]; float c3s = vertical_coefficients[3]; ) + stbIF4( float const * input4 = inputs[4]; float c4s = vertical_coefficients[4]; ) + stbIF5( float const * input5 = inputs[5]; float c5s = vertical_coefficients[5]; ) + stbIF6( float const * input6 = inputs[6]; float c6s = vertical_coefficients[6]; ) + stbIF7( float const * input7 = inputs[7]; float c7s = vertical_coefficients[7]; ) + +#if ( STBIR__vertical_channels == 1 ) && !defined(STB_IMAGE_RESIZE_VERTICAL_CONTINUE) + // check single channel one weight + if ( ( c0s >= (1.0f-0.000001f) ) && ( c0s <= (1.0f+0.000001f) ) ) + { + STBIR_MEMCPY( output, input0, (char*)input0_end - (char*)input0 ); + return; + } +#endif + + #ifdef STBIR_SIMD + { + stbIF0(stbir__simdfX c0 = stbir__simdf_frepX( c0s ); ) + stbIF1(stbir__simdfX c1 = stbir__simdf_frepX( c1s ); ) + stbIF2(stbir__simdfX c2 = stbir__simdf_frepX( c2s ); ) + stbIF3(stbir__simdfX c3 = stbir__simdf_frepX( c3s ); ) + stbIF4(stbir__simdfX c4 = stbir__simdf_frepX( c4s ); ) + stbIF5(stbir__simdfX c5 = stbir__simdf_frepX( c5s ); ) + stbIF6(stbir__simdfX c6 = stbir__simdf_frepX( c6s ); ) + stbIF7(stbir__simdfX c7 = stbir__simdf_frepX( c7s ); ) + + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( ( (char*)input0_end - (char*) input0 ) >= (16*stbir__simdfX_float_count) ) + { + stbir__simdfX o0, o1, o2, o3, r0, r1, r2, r3; + STBIR_SIMD_NO_UNROLL(output); + + // prefetch four loop iterations ahead (doesn't affect much for small resizes, but helps with big ones) + stbIF0( stbir__prefetch( input0 + (16*stbir__simdfX_float_count) ); ) + stbIF1( stbir__prefetch( input1 + (16*stbir__simdfX_float_count) ); ) + stbIF2( stbir__prefetch( input2 + (16*stbir__simdfX_float_count) ); ) + stbIF3( stbir__prefetch( input3 + (16*stbir__simdfX_float_count) ); ) + stbIF4( stbir__prefetch( input4 + (16*stbir__simdfX_float_count) ); ) + stbIF5( stbir__prefetch( input5 + (16*stbir__simdfX_float_count) ); ) + stbIF6( stbir__prefetch( input6 + (16*stbir__simdfX_float_count) ); ) + stbIF7( stbir__prefetch( input7 + (16*stbir__simdfX_float_count) ); ) + + #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE + stbIF0( stbir__simdfX_load( o0, output ); stbir__simdfX_load( o1, output+stbir__simdfX_float_count ); stbir__simdfX_load( o2, output+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( o3, output+(3*stbir__simdfX_float_count) ); + stbir__simdfX_load( r0, input0 ); stbir__simdfX_load( r1, input0+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c0 ); stbir__simdfX_madd( o1, o1, r1, c0 ); stbir__simdfX_madd( o2, o2, r2, c0 ); stbir__simdfX_madd( o3, o3, r3, c0 ); ) + #else + stbIF0( stbir__simdfX_load( r0, input0 ); stbir__simdfX_load( r1, input0+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input0+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input0+(3*stbir__simdfX_float_count) ); + stbir__simdfX_mult( o0, r0, c0 ); stbir__simdfX_mult( o1, r1, c0 ); stbir__simdfX_mult( o2, r2, c0 ); stbir__simdfX_mult( o3, r3, c0 ); ) + #endif + + stbIF1( stbir__simdfX_load( r0, input1 ); stbir__simdfX_load( r1, input1+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input1+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input1+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c1 ); stbir__simdfX_madd( o1, o1, r1, c1 ); stbir__simdfX_madd( o2, o2, r2, c1 ); stbir__simdfX_madd( o3, o3, r3, c1 ); ) + stbIF2( stbir__simdfX_load( r0, input2 ); stbir__simdfX_load( r1, input2+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input2+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input2+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c2 ); stbir__simdfX_madd( o1, o1, r1, c2 ); stbir__simdfX_madd( o2, o2, r2, c2 ); stbir__simdfX_madd( o3, o3, r3, c2 ); ) + stbIF3( stbir__simdfX_load( r0, input3 ); stbir__simdfX_load( r1, input3+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input3+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input3+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c3 ); stbir__simdfX_madd( o1, o1, r1, c3 ); stbir__simdfX_madd( o2, o2, r2, c3 ); stbir__simdfX_madd( o3, o3, r3, c3 ); ) + stbIF4( stbir__simdfX_load( r0, input4 ); stbir__simdfX_load( r1, input4+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input4+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input4+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c4 ); stbir__simdfX_madd( o1, o1, r1, c4 ); stbir__simdfX_madd( o2, o2, r2, c4 ); stbir__simdfX_madd( o3, o3, r3, c4 ); ) + stbIF5( stbir__simdfX_load( r0, input5 ); stbir__simdfX_load( r1, input5+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input5+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input5+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c5 ); stbir__simdfX_madd( o1, o1, r1, c5 ); stbir__simdfX_madd( o2, o2, r2, c5 ); stbir__simdfX_madd( o3, o3, r3, c5 ); ) + stbIF6( stbir__simdfX_load( r0, input6 ); stbir__simdfX_load( r1, input6+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input6+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input6+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c6 ); stbir__simdfX_madd( o1, o1, r1, c6 ); stbir__simdfX_madd( o2, o2, r2, c6 ); stbir__simdfX_madd( o3, o3, r3, c6 ); ) + stbIF7( stbir__simdfX_load( r0, input7 ); stbir__simdfX_load( r1, input7+stbir__simdfX_float_count ); stbir__simdfX_load( r2, input7+(2*stbir__simdfX_float_count) ); stbir__simdfX_load( r3, input7+(3*stbir__simdfX_float_count) ); + stbir__simdfX_madd( o0, o0, r0, c7 ); stbir__simdfX_madd( o1, o1, r1, c7 ); stbir__simdfX_madd( o2, o2, r2, c7 ); stbir__simdfX_madd( o3, o3, r3, c7 ); ) + + stbir__simdfX_store( output, o0 ); stbir__simdfX_store( output+stbir__simdfX_float_count, o1 ); stbir__simdfX_store( output+(2*stbir__simdfX_float_count), o2 ); stbir__simdfX_store( output+(3*stbir__simdfX_float_count), o3 ); + output += (4*stbir__simdfX_float_count); + stbIF0( input0 += (4*stbir__simdfX_float_count); ) stbIF1( input1 += (4*stbir__simdfX_float_count); ) stbIF2( input2 += (4*stbir__simdfX_float_count); ) stbIF3( input3 += (4*stbir__simdfX_float_count); ) stbIF4( input4 += (4*stbir__simdfX_float_count); ) stbIF5( input5 += (4*stbir__simdfX_float_count); ) stbIF6( input6 += (4*stbir__simdfX_float_count); ) stbIF7( input7 += (4*stbir__simdfX_float_count); ) + } + + STBIR_SIMD_NO_UNROLL_LOOP_START + while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) + { + stbir__simdf o0, r0; + STBIR_SIMD_NO_UNROLL(output); + + #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE + stbIF0( stbir__simdf_load( o0, output ); stbir__simdf_load( r0, input0 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); ) + #else + stbIF0( stbir__simdf_load( r0, input0 ); stbir__simdf_mult( o0, r0, stbir__if_simdf8_cast_to_simdf4( c0 ) ); ) + #endif + stbIF1( stbir__simdf_load( r0, input1 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c1 ) ); ) + stbIF2( stbir__simdf_load( r0, input2 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c2 ) ); ) + stbIF3( stbir__simdf_load( r0, input3 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c3 ) ); ) + stbIF4( stbir__simdf_load( r0, input4 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c4 ) ); ) + stbIF5( stbir__simdf_load( r0, input5 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c5 ) ); ) + stbIF6( stbir__simdf_load( r0, input6 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c6 ) ); ) + stbIF7( stbir__simdf_load( r0, input7 ); stbir__simdf_madd( o0, o0, r0, stbir__if_simdf8_cast_to_simdf4( c7 ) ); ) + + stbir__simdf_store( output, o0 ); + output += 4; + stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; ) + } + } + #else + STBIR_NO_UNROLL_LOOP_START + while ( ( (char*)input0_end - (char*) input0 ) >= 16 ) + { + float o0, o1, o2, o3; + STBIR_NO_UNROLL(output); + #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE + stbIF0( o0 = output[0] + input0[0] * c0s; o1 = output[1] + input0[1] * c0s; o2 = output[2] + input0[2] * c0s; o3 = output[3] + input0[3] * c0s; ) + #else + stbIF0( o0 = input0[0] * c0s; o1 = input0[1] * c0s; o2 = input0[2] * c0s; o3 = input0[3] * c0s; ) + #endif + stbIF1( o0 += input1[0] * c1s; o1 += input1[1] * c1s; o2 += input1[2] * c1s; o3 += input1[3] * c1s; ) + stbIF2( o0 += input2[0] * c2s; o1 += input2[1] * c2s; o2 += input2[2] * c2s; o3 += input2[3] * c2s; ) + stbIF3( o0 += input3[0] * c3s; o1 += input3[1] * c3s; o2 += input3[2] * c3s; o3 += input3[3] * c3s; ) + stbIF4( o0 += input4[0] * c4s; o1 += input4[1] * c4s; o2 += input4[2] * c4s; o3 += input4[3] * c4s; ) + stbIF5( o0 += input5[0] * c5s; o1 += input5[1] * c5s; o2 += input5[2] * c5s; o3 += input5[3] * c5s; ) + stbIF6( o0 += input6[0] * c6s; o1 += input6[1] * c6s; o2 += input6[2] * c6s; o3 += input6[3] * c6s; ) + stbIF7( o0 += input7[0] * c7s; o1 += input7[1] * c7s; o2 += input7[2] * c7s; o3 += input7[3] * c7s; ) + output[0] = o0; output[1] = o1; output[2] = o2; output[3] = o3; + output += 4; + stbIF0( input0 += 4; ) stbIF1( input1 += 4; ) stbIF2( input2 += 4; ) stbIF3( input3 += 4; ) stbIF4( input4 += 4; ) stbIF5( input5 += 4; ) stbIF6( input6 += 4; ) stbIF7( input7 += 4; ) + } + #endif + STBIR_NO_UNROLL_LOOP_START + while ( input0 < input0_end ) + { + float o0; + STBIR_NO_UNROLL(output); + #ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE + stbIF0( o0 = output[0] + input0[0] * c0s; ) + #else + stbIF0( o0 = input0[0] * c0s; ) + #endif + stbIF1( o0 += input1[0] * c1s; ) + stbIF2( o0 += input2[0] * c2s; ) + stbIF3( o0 += input3[0] * c3s; ) + stbIF4( o0 += input4[0] * c4s; ) + stbIF5( o0 += input5[0] * c5s; ) + stbIF6( o0 += input6[0] * c6s; ) + stbIF7( o0 += input7[0] * c7s; ) + output[0] = o0; + ++output; + stbIF0( ++input0; ) stbIF1( ++input1; ) stbIF2( ++input2; ) stbIF3( ++input3; ) stbIF4( ++input4; ) stbIF5( ++input5; ) stbIF6( ++input6; ) stbIF7( ++input7; ) + } +} + +#undef stbIF0 +#undef stbIF1 +#undef stbIF2 +#undef stbIF3 +#undef stbIF4 +#undef stbIF5 +#undef stbIF6 +#undef stbIF7 +#undef STB_IMAGE_RESIZE_DO_VERTICALS +#undef STBIR__vertical_channels +#undef STB_IMAGE_RESIZE_DO_HORIZONTALS +#undef STBIR_strs_join24 +#undef STBIR_strs_join14 +#undef STBIR_chans +#ifdef STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#undef STB_IMAGE_RESIZE_VERTICAL_CONTINUE +#endif + +#else // !STB_IMAGE_RESIZE_DO_VERTICALS + +#define STBIR_chans( start, end ) STBIR_strs_join1(start,STBIR__horizontal_channels,end) + +#ifndef stbir__2_coeff_only +#define stbir__2_coeff_only() \ + stbir__1_coeff_only(); \ + stbir__1_coeff_remnant(1); +#endif + +#ifndef stbir__2_coeff_remnant +#define stbir__2_coeff_remnant( ofs ) \ + stbir__1_coeff_remnant(ofs); \ + stbir__1_coeff_remnant((ofs)+1); +#endif + +#ifndef stbir__3_coeff_only +#define stbir__3_coeff_only() \ + stbir__2_coeff_only(); \ + stbir__1_coeff_remnant(2); +#endif + +#ifndef stbir__3_coeff_remnant +#define stbir__3_coeff_remnant( ofs ) \ + stbir__2_coeff_remnant(ofs); \ + stbir__1_coeff_remnant((ofs)+2); +#endif + +#ifndef stbir__3_coeff_setup +#define stbir__3_coeff_setup() +#endif + +#ifndef stbir__4_coeff_start +#define stbir__4_coeff_start() \ + stbir__2_coeff_only(); \ + stbir__2_coeff_remnant(2); +#endif + +#ifndef stbir__4_coeff_continue_from_4 +#define stbir__4_coeff_continue_from_4( ofs ) \ + stbir__2_coeff_remnant(ofs); \ + stbir__2_coeff_remnant((ofs)+2); +#endif + +#ifndef stbir__store_output_tiny +#define stbir__store_output_tiny stbir__store_output +#endif + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_1_coeff)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__1_coeff_only(); + stbir__store_output_tiny(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_2_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__2_coeff_only(); + stbir__store_output_tiny(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_3_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__3_coeff_only(); + stbir__store_output_tiny(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_4_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__4_coeff_start(); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_5_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__4_coeff_start(); + stbir__1_coeff_remnant(4); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_6_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__4_coeff_start(); + stbir__2_coeff_remnant(4); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_7_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + + stbir__4_coeff_start(); + stbir__3_coeff_remnant(4); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_8_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__4_coeff_start(); + stbir__4_coeff_continue_from_4(4); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_9_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__4_coeff_start(); + stbir__4_coeff_continue_from_4(4); + stbir__1_coeff_remnant(8); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_10_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__4_coeff_start(); + stbir__4_coeff_continue_from_4(4); + stbir__2_coeff_remnant(8); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_11_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__4_coeff_start(); + stbir__4_coeff_continue_from_4(4); + stbir__3_coeff_remnant(8); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_12_coeffs)( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + float const * hc = horizontal_coefficients; + stbir__4_coeff_start(); + stbir__4_coeff_continue_from_4(4); + stbir__4_coeff_continue_from_4(8); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod0 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 4 + 3 ) >> 2; + float const * hc = horizontal_coefficients; + + stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + hc += 4; + decode += STBIR__horizontal_channels * 4; + stbir__4_coeff_continue_from_4( 0 ); + --n; + } while ( n > 0 ); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod1 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 5 + 3 ) >> 2; + float const * hc = horizontal_coefficients; + + stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + hc += 4; + decode += STBIR__horizontal_channels * 4; + stbir__4_coeff_continue_from_4( 0 ); + --n; + } while ( n > 0 ); + stbir__1_coeff_remnant( 4 ); + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod2 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 6 + 3 ) >> 2; + float const * hc = horizontal_coefficients; + + stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + hc += 4; + decode += STBIR__horizontal_channels * 4; + stbir__4_coeff_continue_from_4( 0 ); + --n; + } while ( n > 0 ); + stbir__2_coeff_remnant( 4 ); + + stbir__store_output(); + } while ( output < output_end ); +} + +static void STBIR_chans( stbir__horizontal_gather_,_channels_with_n_coeffs_mod3 )( float * output_buffer, unsigned int output_sub_size, float const * decode_buffer, stbir__contributors const * horizontal_contributors, float const * horizontal_coefficients, int coefficient_width ) +{ + float const * output_end = output_buffer + output_sub_size * STBIR__horizontal_channels; + float STBIR_SIMD_STREAMOUT_PTR( * ) output = output_buffer; + stbir__3_coeff_setup(); + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + float const * decode = decode_buffer + horizontal_contributors->n0 * STBIR__horizontal_channels; + int n = ( ( horizontal_contributors->n1 - horizontal_contributors->n0 + 1 ) - 7 + 3 ) >> 2; + float const * hc = horizontal_coefficients; + + stbir__4_coeff_start(); + STBIR_SIMD_NO_UNROLL_LOOP_START + do { + hc += 4; + decode += STBIR__horizontal_channels * 4; + stbir__4_coeff_continue_from_4( 0 ); + --n; + } while ( n > 0 ); + stbir__3_coeff_remnant( 4 ); + + stbir__store_output(); + } while ( output < output_end ); +} + +static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_funcs)[4]= +{ + STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod0), + STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod1), + STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod2), + STBIR_chans(stbir__horizontal_gather_,_channels_with_n_coeffs_mod3), +}; + +static stbir__horizontal_gather_channels_func * STBIR_chans(stbir__horizontal_gather_,_channels_funcs)[12]= +{ + STBIR_chans(stbir__horizontal_gather_,_channels_with_1_coeff), + STBIR_chans(stbir__horizontal_gather_,_channels_with_2_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_3_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_4_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_5_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_6_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_7_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_8_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_9_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_10_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_11_coeffs), + STBIR_chans(stbir__horizontal_gather_,_channels_with_12_coeffs), +}; + +#undef STBIR__horizontal_channels +#undef STB_IMAGE_RESIZE_DO_HORIZONTALS +#undef stbir__1_coeff_only +#undef stbir__1_coeff_remnant +#undef stbir__2_coeff_only +#undef stbir__2_coeff_remnant +#undef stbir__3_coeff_only +#undef stbir__3_coeff_remnant +#undef stbir__3_coeff_setup +#undef stbir__4_coeff_start +#undef stbir__4_coeff_continue_from_4 +#undef stbir__store_output +#undef stbir__store_output_tiny +#undef STBIR_chans + +#endif // HORIZONALS + +#undef STBIR_strs_join2 +#undef STBIR_strs_join1 + +#endif // STB_IMAGE_RESIZE_DO_HORIZONTALS/VERTICALS/CODERS + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/ diff --git a/third_party/stb/stb_image_write.h b/third_party/stb/stb_image_write.h index 95943eb..e4b32ed 100644 --- a/third_party/stb/stb_image_write.h +++ b/third_party/stb/stb_image_write.h @@ -1,4 +1,4 @@ -/* stb_image_write - v1.15 - public domain - http://nothings.org/stb +/* stb_image_write - v1.16 - public domain - http://nothings.org/stb writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015 no warranty implied; use at your own risk @@ -140,6 +140,7 @@ Ivan Tikhonov github:ignotion Adam Schackart + Andrew Kensler LICENSE @@ -166,9 +167,9 @@ LICENSE #endif #ifndef STB_IMAGE_WRITE_STATIC // C++ forbids static forward declarations -extern int stbi_write_tga_with_rle; -extern int stbi_write_png_compression_level; -extern int stbi_write_force_png_filter; +STBIWDEF int stbi_write_tga_with_rle; +STBIWDEF int stbi_write_png_compression_level; +STBIWDEF int stbi_write_force_png_filter; #endif #ifndef STBI_WRITE_NO_STDIO @@ -178,7 +179,7 @@ STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data); STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality); -#ifdef STBI_WINDOWS_UTF8 +#ifdef STBIW_WINDOWS_UTF8 STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input); #endif #endif @@ -285,7 +286,7 @@ static void stbi__stdio_write(void *context, void *data, int size) fwrite(data,1,size,(FILE*) context); } -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8) #ifdef __cplusplus #define STBIW_EXTERN extern "C" #else @@ -296,25 +297,25 @@ STBIW_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned in STBIWDEF int stbiw_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) { - return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); } #endif static FILE *stbiw__fopen(char const *filename, char const *mode) { FILE *f; -#if defined(_MSC_VER) && defined(STBI_WINDOWS_UTF8) +#if defined(_WIN32) && defined(STBIW_WINDOWS_UTF8) wchar_t wMode[64]; wchar_t wFilename[1024]; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename))) + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) return 0; - if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode))) + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) return 0; -#if _MSC_VER >= 1400 - if (0 != _wfopen_s(&f, wFilename, wMode)) - f = 0; +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; #else f = _wfopen(wFilename, wMode); #endif @@ -397,7 +398,7 @@ static void stbiw__putc(stbi__write_context *s, unsigned char c) static void stbiw__write1(stbi__write_context *s, unsigned char a) { - if (s->buf_used + 1 > sizeof(s->buffer)) + if ((size_t)s->buf_used + 1 > sizeof(s->buffer)) stbiw__write_flush(s); s->buffer[s->buf_used++] = a; } @@ -405,7 +406,7 @@ static void stbiw__write1(stbi__write_context *s, unsigned char a) static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c) { int n; - if (s->buf_used + 3 > sizeof(s->buffer)) + if ((size_t)s->buf_used + 3 > sizeof(s->buffer)) stbiw__write_flush(s); n = s->buf_used; s->buf_used = n+3; @@ -490,11 +491,22 @@ static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) { - int pad = (-x*3) & 3; - return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad, - "11 4 22 4" "4 44 22 444444", - 'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header - 40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header + if (comp != 4) { + // write RGB bitmap + int pad = (-x*3) & 3; + return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad, + "11 4 22 4" "4 44 22 444444", + 'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40, // file header + 40, x,y, 1,24, 0,0,0,0,0,0); // bitmap header + } else { + // RGBA bitmaps need a v4 header + // use BI_BITFIELDS mode with 32bpp and alpha mask + // (straight BI_RGB with alpha mask doesn't work in most readers) + return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *)data,1,0, + "11 4 22 4" "4 44 22 444444 4444 4 444 444 444 444", + 'B', 'M', 14+108+x*y*4, 0, 0, 14+108, // file header + 108, x,y, 1,32, 3,0,0,0,0,0, 0xff0000,0xff00,0xff,0xff000000u, 0, 0,0,0, 0,0,0, 0,0,0, 0,0,0); // bitmap V4 header + } } STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) @@ -622,6 +634,8 @@ STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const #define stbiw__max(a, b) ((a) > (b) ? (a) : (b)) +#ifndef STBI_WRITE_NO_STDIO + static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) { int exponent; @@ -756,7 +770,7 @@ static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, f char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"; s->func(s->context, header, sizeof(header)-1); -#ifdef __STDC_WANT_SECURE_LIB__ +#ifdef __STDC_LIB_EXT1__ len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); #else len = sprintf(buffer, "EXPOSURE= 1.0000000000000\n\n-Y %d +X %d\n", y, x); @@ -777,7 +791,6 @@ STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, return stbi_write_hdr_core(&s, x, y, comp, (float *) data); } -#ifndef STBI_WRITE_NO_STDIO STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data) { stbi__write_context s = { 0 }; @@ -968,6 +981,23 @@ STBIWDEF unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, i (void) stbiw__sbfree(hash_table[i]); STBIW_FREE(hash_table); + // store uncompressed instead if compression was worse + if (stbiw__sbn(out) > data_len + 2 + ((data_len+32766)/32767)*5) { + stbiw__sbn(out) = 2; // truncate to DEFLATE 32K window and FLEVEL = 1 + for (j = 0; j < data_len;) { + int blocklen = data_len - j; + if (blocklen > 32767) blocklen = 32767; + stbiw__sbpush(out, data_len - j == blocklen); // BFINAL = ?, BTYPE = 0 -- no compression + stbiw__sbpush(out, STBIW_UCHAR(blocklen)); // LEN + stbiw__sbpush(out, STBIW_UCHAR(blocklen >> 8)); + stbiw__sbpush(out, STBIW_UCHAR(~blocklen)); // NLEN + stbiw__sbpush(out, STBIW_UCHAR(~blocklen >> 8)); + memcpy(out+stbiw__sbn(out), data+j, blocklen); + stbiw__sbn(out) += blocklen; + j += blocklen; + } + } + { // compute adler32 on input unsigned int s1=1, s2=0; @@ -1598,6 +1628,10 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const #endif // STB_IMAGE_WRITE_IMPLEMENTATION /* Revision history + 1.16 (2021-07-11) + make Deflate code emit uncompressed blocks when it would otherwise expand + support writing BMPs with alpha channel + 1.15 (2020-07-13) unknown 1.14 (2020-02-02) updated JPEG writer to downsample chroma channels 1.13 1.12 @@ -1635,7 +1669,7 @@ STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const add HDR output fix monochrome BMP 0.95 (2014-08-17) - add monochrome TGA output + add monochrome TGA output 0.94 (2014-05-31) rename private functions to avoid conflicts with stb_image.h 0.93 (2014-05-27) diff --git a/third_party/stb/stb_rect_pack.h b/third_party/stb/stb_rect_pack.h index 5c848de..6a633ce 100644 --- a/third_party/stb/stb_rect_pack.h +++ b/third_party/stb/stb_rect_pack.h @@ -1,9 +1,15 @@ -// stb_rect_pack.h - v1.00 - public domain - rectangle packing +// stb_rect_pack.h - v1.01 - public domain - rectangle packing // Sean Barrett 2014 // // Useful for e.g. packing rectangular textures into an atlas. // Does not do rotation. // +// Before #including, +// +// #define STB_RECT_PACK_IMPLEMENTATION +// +// in the file that you want to have the implementation. +// // Not necessarily the awesomest packing method, but better than // the totally naive one in stb_truetype (which is primarily what // this is meant to replace). @@ -35,6 +41,7 @@ // // Version history: // +// 1.01 (2021-07-11) always use large rect mode, expose STBRP__MAXVAL in public section // 1.00 (2019-02-25) avoid small space waste; gracefully fail too-wide rectangles // 0.99 (2019-02-07) warning fixes // 0.11 (2017-03-03) return packing success/fail result @@ -75,11 +82,10 @@ typedef struct stbrp_context stbrp_context; typedef struct stbrp_node stbrp_node; typedef struct stbrp_rect stbrp_rect; -#ifdef STBRP_LARGE_RECTS typedef int stbrp_coord; -#else -typedef unsigned short stbrp_coord; -#endif + +#define STBRP__MAXVAL 0x7fffffff +// Mostly for internal use, but this is the maximum supported coordinate value. STBRP_DEF int stbrp_pack_rects (stbrp_context *context, stbrp_rect *rects, int num_rects); // Assign packed locations to rectangles. The rectangles are of type @@ -209,8 +215,10 @@ struct stbrp_context #ifdef _MSC_VER #define STBRP__NOTUSED(v) (void)(v) +#define STBRP__CDECL __cdecl #else #define STBRP__NOTUSED(v) (void)sizeof(v) +#define STBRP__CDECL #endif enum @@ -253,9 +261,6 @@ STBRP_DEF void stbrp_setup_allow_out_of_mem(stbrp_context *context, int allow_ou STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height, stbrp_node *nodes, int num_nodes) { int i; -#ifndef STBRP_LARGE_RECTS - STBRP_ASSERT(width <= 0xffff && height <= 0xffff); -#endif for (i=0; i < num_nodes-1; ++i) nodes[i].next = &nodes[i+1]; @@ -274,11 +279,7 @@ STBRP_DEF void stbrp_init_target(stbrp_context *context, int width, int height, context->extra[0].y = 0; context->extra[0].next = &context->extra[1]; context->extra[1].x = (stbrp_coord) width; -#ifdef STBRP_LARGE_RECTS context->extra[1].y = (1<<30); -#else - context->extra[1].y = 65535; -#endif context->extra[1].next = NULL; } @@ -520,7 +521,7 @@ static stbrp__findresult stbrp__skyline_pack_rectangle(stbrp_context *context, i return res; } -static int rect_height_compare(const void *a, const void *b) +static int STBRP__CDECL rect_height_compare(const void *a, const void *b) { const stbrp_rect *p = (const stbrp_rect *) a; const stbrp_rect *q = (const stbrp_rect *) b; @@ -531,19 +532,13 @@ static int rect_height_compare(const void *a, const void *b) return (p->w > q->w) ? -1 : (p->w < q->w); } -static int rect_original_order(const void *a, const void *b) +static int STBRP__CDECL rect_original_order(const void *a, const void *b) { const stbrp_rect *p = (const stbrp_rect *) a; const stbrp_rect *q = (const stbrp_rect *) b; return (p->was_packed < q->was_packed) ? -1 : (p->was_packed > q->was_packed); } -#ifdef STBRP_LARGE_RECTS -#define STBRP__MAXVAL 0xffffffff -#else -#define STBRP__MAXVAL 0xffff -#endif - STBRP_DEF int stbrp_pack_rects(stbrp_context *context, stbrp_rect *rects, int num_rects) { int i, all_rects_packed = 1; diff --git a/third_party/stb/stb_sprintf.h b/third_party/stb/stb_sprintf.h index 0635360..ca432a6 100644 --- a/third_party/stb/stb_sprintf.h +++ b/third_party/stb/stb_sprintf.h @@ -1,4 +1,4 @@ -// stb_sprintf - v1.09 - public domain snprintf() implementation +// stb_sprintf - v1.10 - public domain snprintf() implementation // originally by Jeff Roberts / RAD Game Tools, 2015/10/20 // http://github.com/nothings/stb // @@ -7,6 +7,7 @@ // // Contributors: // Fabian "ryg" Giesen (reformatting) +// github:aganm (attribute format) // // Contributors (bugfixes): // github:d26435 @@ -153,8 +154,8 @@ PERFORMANCE vs MSVC 2008 32-/64-bit (GCC is even slower than MSVC): #endif #endif #endif -#elif __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) - #if __SANITIZE_ADDRESS__ +#elif defined(__GNUC__) && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) + #if defined(__SANITIZE_ADDRESS__) && __SANITIZE_ADDRESS__ #define STBSP__ASAN __attribute__((__no_sanitize_address__)) #endif #endif @@ -176,7 +177,23 @@ PERFORMANCE vs MSVC 2008 32-/64-bit (GCC is even slower than MSVC): #endif #endif -#include // for va_list() +#if defined(__has_attribute) + #if __has_attribute(format) + #define STBSP__ATTRIBUTE_FORMAT(fmt,va) __attribute__((format(printf,fmt,va))) + #endif +#endif + +#ifndef STBSP__ATTRIBUTE_FORMAT +#define STBSP__ATTRIBUTE_FORMAT(fmt,va) +#endif + +#ifdef _MSC_VER +#define STBSP__NOTUSED(v) (void)(v) +#else +#define STBSP__NOTUSED(v) (void)sizeof(v) +#endif + +#include // for va_arg(), va_list() #include // size_t, ptrdiff_t #ifndef STB_SPRINTF_MIN @@ -188,20 +205,18 @@ typedef char *STBSP_SPRINTFCB(const char *buf, void *user, int len); #define STB_SPRINTF_DECORATE(name) stbsp_##name // define this before including if you want to change the names #endif -STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va); -STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsnprintf)(char *buf, int count, char const *fmt, va_list va); -STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...); -STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...); +STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsprintf)(char *buf, char const *fmt, va_list va); +STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsnprintf)(char *buf, int count, char const *fmt, va_list va); +STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(sprintf)(char *buf, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(2,3); +STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(snprintf)(char *buf, int count, char const *fmt, ...) STBSP__ATTRIBUTE_FORMAT(3,4); -STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va); -STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char comma, char period); +STBSP__PUBLICDEC int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va); +STBSP__PUBLICDEC void STB_SPRINTF_DECORATE(set_separators)(char comma, char period); #endif // STB_SPRINTF_H_INCLUDE #ifdef STB_SPRINTF_IMPLEMENTATION -#include // for va_arg() - #define stbsp__uint32 unsigned int #define stbsp__int32 signed int @@ -215,7 +230,7 @@ STBSP__PUBLICDEF void STB_SPRINTF_DECORATE(set_separators)(char comma, char peri #define stbsp__uint16 unsigned short #ifndef stbsp__uintptr -#if defined(__ppc64__) || defined(__powerpc64__) || defined(__aarch64__) || defined(_M_X64) || defined(__x86_64__) || defined(__x86_64) +#if defined(__ppc64__) || defined(__powerpc64__) || defined(__aarch64__) || defined(_M_X64) || defined(__x86_64__) || defined(__x86_64) || defined(__s390x__) #define stbsp__uintptr stbsp__uint64 #else #define stbsp__uintptr stbsp__uint32 @@ -291,6 +306,46 @@ static void stbsp__lead_sign(stbsp__uint32 fl, char *sign) } } +static STBSP__ASAN stbsp__uint32 stbsp__strlen_limited(char const *s, stbsp__uint32 limit) +{ + char const * sn = s; + + // get up to 4-byte alignment + for (;;) { + if (((stbsp__uintptr)sn & 3) == 0) + break; + + if (!limit || *sn == 0) + return (stbsp__uint32)(sn - s); + + ++sn; + --limit; + } + + // scan over 4 bytes at a time to find terminating 0 + // this will intentionally scan up to 3 bytes past the end of buffers, + // but becase it works 4B aligned, it will never cross page boundaries + // (hence the STBSP__ASAN markup; the over-read here is intentional + // and harmless) + while (limit >= 4) { + stbsp__uint32 v = *(stbsp__uint32 *)sn; + // bit hack to find if there's a 0 byte in there + if ((v - 0x01010101) & (~v) & 0x80808080UL) + break; + + sn += 4; + limit -= 4; + } + + // handle the last few characters to find actual size + while (limit && *sn) { + ++sn; + --limit; + } + + return (stbsp__uint32)(sn - s); +} + STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, void *user, char *buf, char const *fmt, va_list va) { static char hex[] = "0123456789abcdefxp"; @@ -534,37 +589,9 @@ STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, s = va_arg(va, char *); if (s == 0) s = (char *)"null"; - // get the length - sn = s; - for (;;) { - if ((((stbsp__uintptr)sn) & 3) == 0) - break; - lchk: - if (sn[0] == 0) - goto ld; - ++sn; - } - n = 0xffffffff; - if (pr >= 0) { - n = (stbsp__uint32)(sn - s); - if (n >= (stbsp__uint32)pr) - goto ld; - n = ((stbsp__uint32)(pr - n)) >> 2; - } - while (n) { - stbsp__uint32 v = *(stbsp__uint32 *)sn; - if ((v - 0x01010101) & (~v) & 0x80808080UL) - goto lchk; - sn += 4; - --n; - } - goto lchk; - ld: - - l = (stbsp__uint32)(sn - s); - // clamp to precision - if (l > (stbsp__uint32)pr) - l = pr; + // get the length, limited to desired precision + // always limit to ~0u chars since our counts are 32b + l = stbsp__strlen_limited(s, (pr >= 0) ? pr : ~0u); lead[0] = 0; tail[0] = 0; pr = 0; @@ -605,8 +632,8 @@ STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, lead[0] = 0; tail[0] = 0; pr = 0; - dp = 0; cs = 0; + STBSP__NOTUSED(dp); goto scopy; #else case 'A': // hex float @@ -1001,7 +1028,7 @@ STBSP__PUBLICDEF int STB_SPRINTF_DECORATE(vsprintfcb)(STBSP_SPRINTFCB *callback, lead[0] = 0; if (pr == 0) { l = 0; - cs = (((l >> 4) & 15)) << 24; + cs = 0; goto scopy; } } @@ -1588,7 +1615,7 @@ static stbsp__uint64 const stbsp__powten[20] = { #define stbsp__ddtoS64(ob, xh, xl) \ { \ double ahi = 0, alo, vh, t; \ - ob = (stbsp__int64)ph; \ + ob = (stbsp__int64)xh; \ vh = (double)ob; \ ahi = (xh - vh); \ t = (ahi - xh); \ diff --git a/third_party/stb/stb_textedit.h b/third_party/stb/stb_textedit.h index cd38a25..1442493 100644 --- a/third_party/stb/stb_textedit.h +++ b/third_party/stb/stb_textedit.h @@ -1,4 +1,4 @@ -// stb_textedit.h - v1.13 - public domain - Sean Barrett +// stb_textedit.h - v1.14 - public domain - Sean Barrett // Development of this library was sponsored by RAD Game Tools // // This C header file implements the guts of a multi-line text-editing @@ -29,6 +29,7 @@ // // VERSION HISTORY // +// 1.14 (2021-07-11) page up/down, various fixes // 1.13 (2019-02-07) fix bug in undo size management // 1.12 (2018-01-29) user can change STB_TEXTEDIT_KEYTYPE, fix redo to avoid crash // 1.11 (2017-03-03) fix HOME on last line, dragging off single-line textfield @@ -52,6 +53,7 @@ // Ulf Winklemann: move-by-word in 1.1 // Fabian Giesen: secondary key inputs in 1.5 // Martins Mozeiko: STB_TEXTEDIT_memmove in 1.6 +// Louis Schnellbach: page up/down in 1.14 // // Bugfixes: // Scott Graham @@ -87,8 +89,8 @@ // moderate sizes. The undo system does no memory allocations, so // it grows STB_TexteditState by the worst-case storage which is (in bytes): // -// [4 + 3 * sizeof(STB_TEXTEDIT_POSITIONTYPE)] * STB_TEXTEDIT_UNDOSTATE_COUNT -// + sizeof(STB_TEXTEDIT_CHARTYPE) * STB_TEXTEDIT_UNDOCHAR_COUNT +// [4 + 3 * sizeof(STB_TEXTEDIT_POSITIONTYPE)] * STB_TEXTEDIT_UNDOSTATECOUNT +// + sizeof(STB_TEXTEDIT_CHARTYPE) * STB_TEXTEDIT_UNDOCHARCOUNT // // // Implementation mode: @@ -142,6 +144,8 @@ // STB_TEXTEDIT_K_RIGHT keyboard input to move cursor right // STB_TEXTEDIT_K_UP keyboard input to move cursor up // STB_TEXTEDIT_K_DOWN keyboard input to move cursor down +// STB_TEXTEDIT_K_PGUP keyboard input to move cursor up a page +// STB_TEXTEDIT_K_PGDOWN keyboard input to move cursor down a page // STB_TEXTEDIT_K_LINESTART keyboard input to move cursor to start of line // e.g. HOME // STB_TEXTEDIT_K_LINEEND keyboard input to move cursor to end of line // e.g. END // STB_TEXTEDIT_K_TEXTSTART keyboard input to move cursor to start of text // e.g. ctrl-HOME @@ -164,10 +168,6 @@ // STB_TEXTEDIT_K_TEXTSTART2 secondary keyboard input to move cursor to start of text // STB_TEXTEDIT_K_TEXTEND2 secondary keyboard input to move cursor to end of text // -// Todo: -// STB_TEXTEDIT_K_PGUP keyboard input to move cursor up a page -// STB_TEXTEDIT_K_PGDOWN keyboard input to move cursor down a page -// // Keyboard input must be encoded as a single integer value; e.g. a character code // and some bitflags that represent shift states. to simplify the interface, SHIFT must // be a bitflag, so we can test the shifted state of cursor movements to allow selection, @@ -331,6 +331,10 @@ typedef struct // each textfield keeps its own insert mode state. to keep an app-wide // insert mode, copy this value in/out of the app state + int row_count_per_page; + // page size in number of row. + // this value MUST be set to >0 for pageup or pagedown in multilines documents. + ///////////////////// // // private data @@ -708,9 +712,7 @@ static int stb_textedit_paste_internal(STB_TEXTEDIT_STRING *str, STB_TexteditSta state->has_preferred_x = 0; return 1; } - // remove the undo since we didn't actually insert the characters - if (state->undostate.undo_point) - --state->undostate.undo_point; + // note: paste failure will leave deleted selection, may be restored with an undo (see https://github.com/nothings/stb/issues/734 for details) return 0; } @@ -849,12 +851,16 @@ static void stb_textedit_key(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, break; case STB_TEXTEDIT_K_DOWN: - case STB_TEXTEDIT_K_DOWN | STB_TEXTEDIT_K_SHIFT: { + case STB_TEXTEDIT_K_DOWN | STB_TEXTEDIT_K_SHIFT: + case STB_TEXTEDIT_K_PGDOWN: + case STB_TEXTEDIT_K_PGDOWN | STB_TEXTEDIT_K_SHIFT: { StbFindState find; StbTexteditRow row; - int i, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0; + int i, j, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0; + int is_page = (key & ~STB_TEXTEDIT_K_SHIFT) == STB_TEXTEDIT_K_PGDOWN; + int row_count = is_page ? state->row_count_per_page : 1; - if (state->single_line) { + if (!is_page && state->single_line) { // on windows, up&down in single-line behave like left&right key = STB_TEXTEDIT_K_RIGHT | (key & STB_TEXTEDIT_K_SHIFT); goto retry; @@ -863,17 +869,20 @@ static void stb_textedit_key(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, if (sel) stb_textedit_prep_selection_at_cursor(state); else if (STB_TEXT_HAS_SELECTION(state)) - stb_textedit_move_to_last(str,state); + stb_textedit_move_to_last(str, state); // compute current position of cursor point stb_textedit_clamp(str, state); stb_textedit_find_charpos(&find, str, state->cursor, state->single_line); - // now find character position down a row - if (find.length) { - float goal_x = state->has_preferred_x ? state->preferred_x : find.x; - float x; + for (j = 0; j < row_count; ++j) { + float x, goal_x = state->has_preferred_x ? state->preferred_x : find.x; int start = find.first_char + find.length; + + if (find.length == 0) + break; + + // now find character position down a row state->cursor = start; STB_TEXTEDIT_LAYOUTROW(&row, str, state->cursor); x = row.x0; @@ -895,17 +904,25 @@ static void stb_textedit_key(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, if (sel) state->select_end = state->cursor; + + // go to next line + find.first_char = find.first_char + find.length; + find.length = row.num_chars; } break; } case STB_TEXTEDIT_K_UP: - case STB_TEXTEDIT_K_UP | STB_TEXTEDIT_K_SHIFT: { + case STB_TEXTEDIT_K_UP | STB_TEXTEDIT_K_SHIFT: + case STB_TEXTEDIT_K_PGUP: + case STB_TEXTEDIT_K_PGUP | STB_TEXTEDIT_K_SHIFT: { StbFindState find; StbTexteditRow row; - int i, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0; + int i, j, prev_scan, sel = (key & STB_TEXTEDIT_K_SHIFT) != 0; + int is_page = (key & ~STB_TEXTEDIT_K_SHIFT) == STB_TEXTEDIT_K_PGUP; + int row_count = is_page ? state->row_count_per_page : 1; - if (state->single_line) { + if (!is_page && state->single_line) { // on windows, up&down become left&right key = STB_TEXTEDIT_K_LEFT | (key & STB_TEXTEDIT_K_SHIFT); goto retry; @@ -920,11 +937,14 @@ static void stb_textedit_key(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, stb_textedit_clamp(str, state); stb_textedit_find_charpos(&find, str, state->cursor, state->single_line); - // can only go up if there's a previous row - if (find.prev_first != find.first_char) { + for (j = 0; j < row_count; ++j) { + float x, goal_x = state->has_preferred_x ? state->preferred_x : find.x; + + // can only go up if there's a previous row + if (find.prev_first == find.first_char) + break; + // now find character position up a row - float goal_x = state->has_preferred_x ? state->preferred_x : find.x; - float x; state->cursor = find.prev_first; STB_TEXTEDIT_LAYOUTROW(&row, str, state->cursor); x = row.x0; @@ -946,6 +966,14 @@ static void stb_textedit_key(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, if (sel) state->select_end = state->cursor; + + // go to previous line + // (we need to scan previous line the hard way. maybe we could expose this as a new API function?) + prev_scan = find.prev_first > 0 ? find.prev_first - 1 : 0; + while (prev_scan > 0 && STB_TEXTEDIT_GETCHAR(str, prev_scan - 1) != STB_TEXTEDIT_NEWLINE) + --prev_scan; + find.first_char = find.prev_first; + find.prev_first = prev_scan; } break; } @@ -1069,10 +1097,6 @@ static void stb_textedit_key(STB_TEXTEDIT_STRING *str, STB_TexteditState *state, state->has_preferred_x = 0; break; } - -// @TODO: -// STB_TEXTEDIT_K_PGUP - move cursor up a page -// STB_TEXTEDIT_K_PGDOWN - move cursor down a page } } @@ -1337,6 +1361,7 @@ static void stb_textedit_clear_state(STB_TexteditState *state, int is_single_lin state->initialized = 1; state->single_line = (unsigned char) is_single_line; state->insert_mode = 0; + state->row_count_per_page = 0; } // API initialize diff --git a/third_party/stb/stb_tilemap_editor.h b/third_party/stb/stb_tilemap_editor.h index 9b77364..fbd3388 100644 --- a/third_party/stb/stb_tilemap_editor.h +++ b/third_party/stb/stb_tilemap_editor.h @@ -1,4 +1,4 @@ -// stb_tilemap_editor.h - v0.41 - Sean Barrett - http://nothings.org/stb +// stb_tilemap_editor.h - v0.42 - Sean Barrett - http://nothings.org/stb // placed in the public domain - not copyrighted - first released 2014-09 // // Embeddable tilemap editor for C/C++ @@ -275,6 +275,7 @@ // either approach allows cut&pasting between levels.) // // REVISION HISTORY +// 0.42 fix compilation errors // 0.41 fix warnings // 0.40 fix warning // 0.39 fix warning @@ -317,6 +318,8 @@ // Bugfixes: // Ryan Whitworth // Eugene Opalev +// Rob Loach +// github:wernsey // // LICENSE // @@ -1821,6 +1824,8 @@ static int stbte__button(int colormode, const char *label, int x, int y, int tex int x0=x,y0=y, x1=x+width,y1=y+STBTE__BUTTON_HEIGHT; int s = STBTE__BUTTON_INTERNAL_SPACING; + if(!disabled) stbte__hittest(x0,y0,x1,y1,id); + if (stbte__ui.event == STBTE__paint) stbte__draw_textbox(x0,y0,x1,y1, (char*) label,s+textoff,s, colormode, STBTE__INDEX_FOR_ID(id,disabled,toggled)); if (disabled) @@ -1833,6 +1838,8 @@ static int stbte__button_icon(int colormode, char ch, int x, int y, int width, i int x0=x,y0=y, x1=x+width,y1=y+STBTE__BUTTON_HEIGHT; int s = STBTE__BUTTON_INTERNAL_SPACING; + stbte__hittest(x0,y0,x1,y1,id); + if (stbte__ui.event == STBTE__paint) { char label[2] = { ch, 0 }; int pad = (9 - stbte__get_char_width(ch))/2; @@ -1846,6 +1853,7 @@ static int stbte__button_icon(int colormode, char ch, int x, int y, int width, i static int stbte__minibutton(int colormode, int x, int y, int ch, int id) { int x0 = x, y0 = y, x1 = x+8, y1 = y+7; + stbte__hittest(x0,y0,x1,y1,id); if (stbte__ui.event == STBTE__paint) { char str[2] = { (char)ch, 0 }; stbte__draw_textbox(x0,y0,x1,y1, str,1,0,colormode, STBTE__INDEX_FOR_ID(id,0,0)); @@ -1856,6 +1864,7 @@ static int stbte__minibutton(int colormode, int x, int y, int ch, int id) static int stbte__layerbutton(int x, int y, int ch, int id, int toggled, int disabled, int colormode) { int x0 = x, y0 = y, x1 = x+10, y1 = y+11; + if(!disabled) stbte__hittest(x0,y0,x1,y1,id); if (stbte__ui.event == STBTE__paint) { char str[2] = { (char)ch, 0 }; int off = (9-stbte__get_char_width(ch))/2; @@ -1869,6 +1878,7 @@ static int stbte__layerbutton(int x, int y, int ch, int id, int toggled, int dis static int stbte__microbutton(int x, int y, int size, int id, int colormode) { int x0 = x, y0 = y, x1 = x+size, y1 = y+size; + stbte__hittest(x0,y0,x1,y1,id); if (stbte__ui.event == STBTE__paint) { stbte__draw_box(x0,y0,x1,y1, colormode, STBTE__INDEX_FOR_ID(id,0,0)); } @@ -1878,6 +1888,7 @@ static int stbte__microbutton(int x, int y, int size, int id, int colormode) static int stbte__microbutton_dragger(int x, int y, int size, int id, int *pos) { int x0 = x, y0 = y, x1 = x+size, y1 = y+size; + stbte__hittest(x0,y0,x1,y1,id); switch (stbte__ui.event) { case STBTE__paint: stbte__draw_box(x0,y0,x1,y1, STBTE__cexpander, STBTE__INDEX_FOR_ID(id,0,0)); @@ -1908,6 +1919,8 @@ static int stbte__category_button(const char *label, int x, int y, int width, in int x0=x,y0=y, x1=x+width,y1=y+STBTE__BUTTON_HEIGHT; int s = STBTE__BUTTON_INTERNAL_SPACING; + stbte__hittest(x0,y0,x1,y1,id); + if (stbte__ui.event == STBTE__paint) stbte__draw_textbox(x0,y0,x1,y1, (char*) label, s,s, STBTE__ccategory_button, STBTE__INDEX_FOR_ID(id,0,toggled)); @@ -1927,6 +1940,7 @@ static int stbte__slider(int x0, int w, int y, int range, int *value, int id) { int x1 = x0+w; int pos = *value * w / (range+1); + stbte__hittest(x0,y-2,x1,y+3,id); int event_mouse_move = STBTE__change; switch (stbte__ui.event) { case STBTE__paint: @@ -1969,6 +1983,7 @@ static int stbte__float_control(int x0, int y0, int w, float minv, float maxv, f { int x1 = x0+w; int y1 = y0+11; + stbte__hittest(x0,y0,x1,y1,id); switch (stbte__ui.event) { case STBTE__paint: { char text[32]; @@ -1980,7 +1995,7 @@ static int stbte__float_control(int x0, int y0, int w, float minv, float maxv, f case STBTE__rightdown: if (STBTE__IS_HOT(id) && STBTE__INACTIVE()) stbte__activate(id); - return STBTE__begin; + return STBTE__begin; break; case STBTE__leftup: case STBTE__rightup: @@ -2020,7 +2035,6 @@ static int stbte__float_control(int x0, int y0, int w, float minv, float maxv, f static void stbte__scrollbar(int x, int y0, int y1, int *val, int v0, int v1, int num_vis, int id) { - int over; int thumbpos; if (v1 - v0 <= num_vis) return; @@ -2029,7 +2043,7 @@ static void stbte__scrollbar(int x, int y0, int y1, int *val, int v0, int v1, in thumbpos = y0+2 + (y1-y0-4) * *val / (v1 - v0 - num_vis); if (thumbpos < y0) thumbpos = y0; if (thumbpos >= y1) thumbpos = y1; - over = stbte__hittest(x-1,y0,x+2,y1,id); + stbte__hittest(x-1,y0,x+2,y1,id); switch (stbte__ui.event) { case STBTE__paint: stbte__draw_rect(x,y0,x+1,y1, stbte__color_table[STBTE__cscrollbar][STBTE__text][STBTE__idle]); @@ -2807,6 +2821,10 @@ static void stbte__drag_update(stbte_tilemap *tm, int mapx, int mapy, int copy_p int ox,oy,i,deleted=0,written=0; short temp[STBTE_MAX_LAYERS]; short *data = NULL; + + STBTE__NOTUSED(deleted); + STBTE__NOTUSED(written); + if (!stbte__ui.shift) { ox = mapx - stbte__ui.drag_x; oy = mapy - stbte__ui.drag_y; @@ -2928,6 +2946,9 @@ static void stbte__tile_paint(stbte_tilemap *tm, int sx, int sy, int mapx, int m { int i; int id = STBTE__IDMAP(mapx,mapy); + int x0=sx, y0=sy; + int x1=sx+tm->spacing_x, y1=sy+tm->spacing_y; + stbte__hittest(x0,y0,x1,y1, id); short *data = tm->data[mapy][mapx]; short temp[STBTE_MAX_LAYERS]; @@ -2996,7 +3017,7 @@ static void stbte__tile_paint(stbte_tilemap *tm, int sx, int sy, int mapx, int m i = layer; if (i == tm->solo_layer || (!tm->layerinfo[i].hidden && tm->solo_layer < 0)) if (data[i] >= 0) - STBTE_DRAW_TILE(x0,y0, (unsigned short) data[i], 0, tm->props[mapy][mapx]); + STBTE_DRAW_TILE(sx,sy, (unsigned short) data[i], 0, tm->props[mapy][mapx]); } } @@ -3492,11 +3513,14 @@ static void stbte__categories(stbte_tilemap *tm, int x0, int y0, int w, int h) static void stbte__tile_in_palette(stbte_tilemap *tm, int x, int y, int slot) { + stbte__tileinfo *t = &tm->tiles[slot]; + int x0=x, y0=y, x1 = x+tm->palette_spacing_x - 1, y1 = y+tm->palette_spacing_y; int id = STBTE__ID(STBTE__palette, slot); + stbte__hittest(x0,y0,x1,y1, id); switch (stbte__ui.event) { case STBTE__paint: stbte__draw_rect(x,y,x+tm->palette_spacing_x-1,y+tm->palette_spacing_x-1, STBTE_COLOR_TILEPALETTE_BACKGROUND); - STBTE_DRAW_TILE(x,y,t->id, slot == tm->cur_tile,0); + STBTE_DRAW_TILE(x,y,id, slot == tm->cur_tile,0); if (slot == tm->cur_tile) stbte__draw_frame_delayed(x-1,y-1,x+tm->palette_spacing_x,y+tm->palette_spacing_y, STBTE_COLOR_TILEPALETTE_OUTLINE); break; @@ -3565,6 +3589,7 @@ static void stbte__props_panel(stbte_tilemap *tm, int x0, int y0, int w, int h) my = stbte__ui.select_y0; p = tm->props[my][mx]; data = tm->data[my][mx]; + STBTE__NOTUSED(data); for (i=0; i < STBTE_MAX_PROPERTIES; ++i) { unsigned int n = STBTE_PROP_TYPE(i, data, p); if (n) { @@ -3644,8 +3669,9 @@ static void stbte__props_panel(stbte_tilemap *tm, int x0, int y0, int w, int h) } } -static int stbte__cp_mode, stbte__cp_aspect, stbte__cp_state, stbte__cp_index, stbte__save, stbte__cp_altered, stbte__color_copy; +static int stbte__cp_mode, stbte__cp_aspect, stbte__save, stbte__cp_altered; #ifdef STBTE__COLORPICKER +static int stbte__cp_state, stbte__cp_index, stbte__color_copy; static void stbte__dump_colorstate(void) { int i,j,k; diff --git a/third_party/stb/stb_truetype.h b/third_party/stb/stb_truetype.h index 62595a1..90a5c2e 100644 --- a/third_party/stb/stb_truetype.h +++ b/third_party/stb/stb_truetype.h @@ -1,5 +1,5 @@ -// stb_truetype.h - v1.24 - public domain -// authored from 2009-2020 by Sean Barrett / RAD Game Tools +// stb_truetype.h - v1.26 - public domain +// authored from 2009-2021 by Sean Barrett / RAD Game Tools // // ======================================================================= // @@ -54,10 +54,12 @@ // Hou Qiming Derek Vinyard // Rob Loach Cort Stratton // Kenney Phillis Jr. Brian Costabile -// Ken Voskuil (kaesve) +// Ken Voskuil (kaesve) Yakov Galka // // VERSION HISTORY // +// 1.26 (2021-08-28) fix broken rasterizer +// 1.25 (2021-07-11) many fixes // 1.24 (2020-02-05) fix warning // 1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS) // 1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined @@ -270,8 +272,8 @@ //// SAMPLE PROGRAMS //// // -// Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless -// +// Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless. +// See "tests/truetype_demo_win32.c" for a complete version. #if 0 #define STB_TRUETYPE_IMPLEMENTATION // force following include to generate implementation #include "stb_truetype.h" @@ -297,6 +299,8 @@ void my_stbtt_initfont(void) void my_stbtt_print(float x, float y, char *text) { // assume orthographic projection with units = screen pixels, origin at top left + glEnable(GL_BLEND); + glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA); glEnable(GL_TEXTURE_2D); glBindTexture(GL_TEXTURE_2D, ftex); glBegin(GL_QUADS); @@ -304,10 +308,10 @@ void my_stbtt_print(float x, float y, char *text) if (*text >= 32 && *text < 128) { stbtt_aligned_quad q; stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl & d3d10+,0=d3d9 - glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y0); - glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y0); - glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y1); - glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y1); + glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y0); + glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y0); + glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y1); + glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y1); } ++text; } @@ -853,6 +857,7 @@ STBTT_DEF int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, s STBTT_DEF void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices); // frees the data allocated above +STBTT_DEF unsigned char *stbtt_FindSVGDoc(const stbtt_fontinfo *info, int gl); STBTT_DEF int stbtt_GetCodepointSVG(const stbtt_fontinfo *info, int unicode_codepoint, const char **svg); STBTT_DEF int stbtt_GetGlyphSVG(const stbtt_fontinfo *info, int gl, const char **svg); // fills svg with the character's SVG data. @@ -1539,12 +1544,12 @@ STBTT_DEF int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codep search += 2; { - stbtt_uint16 offset, start; + stbtt_uint16 offset, start, last; stbtt_uint16 item = (stbtt_uint16) ((search - endCount) >> 1); - STBTT_assert(unicode_codepoint <= ttUSHORT(data + endCount + 2*item)); start = ttUSHORT(data + index_map + 14 + segcount*2 + 2 + 2*item); - if (unicode_codepoint < start) + last = ttUSHORT(data + endCount + 2*item); + if (unicode_codepoint < start || unicode_codepoint > last) return 0; offset = ttUSHORT(data + index_map + 14 + segcount*6 + 2 + 2*item); @@ -1871,7 +1876,7 @@ static int stbtt__GetGlyphShapeTT(const stbtt_fontinfo *info, int glyph_index, s if (comp_verts) STBTT_free(comp_verts, info->userdata); return 0; } - if (num_vertices > 0) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex)); + if (num_vertices > 0 && vertices) STBTT_memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex)); STBTT_memcpy(tmp+num_vertices, comp_verts, comp_num_verts*sizeof(stbtt_vertex)); if (vertices) STBTT_free(vertices, info->userdata); vertices = tmp; @@ -2134,7 +2139,7 @@ static int stbtt__run_charstring(const stbtt_fontinfo *info, int glyph_index, st subrs = stbtt__cid_get_glyph_subrs(info, glyph_index); has_subrs = 1; } - // fallthrough + // FALLTHROUGH case 0x1D: // callgsubr if (sp < 1) return STBTT__CSERR("call(g|)subr stack"); v = (int) s[--sp]; @@ -2239,7 +2244,7 @@ static int stbtt__run_charstring(const stbtt_fontinfo *info, int glyph_index, st } break; default: - if (b0 != 255 && b0 != 28 && (b0 < 32 || b0 > 254)) + if (b0 != 255 && b0 != 28 && b0 < 32) return STBTT__CSERR("reserved operator"); // push immediate @@ -2351,7 +2356,7 @@ STBTT_DEF int stbtt_GetKerningTable(const stbtt_fontinfo *info, stbtt_kerningent return length; } -static int stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2) +static int stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2) { stbtt_uint8 *data = info->data + info->kern; stbtt_uint32 needle, straw; @@ -2381,243 +2386,225 @@ static int stbtt__GetGlyphKernInfoAdvance(const stbtt_fontinfo *info, int glyph return 0; } -static stbtt_int32 stbtt__GetCoverageIndex(stbtt_uint8 *coverageTable, int glyph) -{ - stbtt_uint16 coverageFormat = ttUSHORT(coverageTable); - switch(coverageFormat) { - case 1: { - stbtt_uint16 glyphCount = ttUSHORT(coverageTable + 2); - - // Binary search. - stbtt_int32 l=0, r=glyphCount-1, m; - int straw, needle=glyph; - while (l <= r) { - stbtt_uint8 *glyphArray = coverageTable + 4; - stbtt_uint16 glyphID; - m = (l + r) >> 1; - glyphID = ttUSHORT(glyphArray + 2 * m); - straw = glyphID; - if (needle < straw) - r = m - 1; - else if (needle > straw) - l = m + 1; - else { - return m; - } +static stbtt_int32 stbtt__GetCoverageIndex(stbtt_uint8 *coverageTable, int glyph) +{ + stbtt_uint16 coverageFormat = ttUSHORT(coverageTable); + switch (coverageFormat) { + case 1: { + stbtt_uint16 glyphCount = ttUSHORT(coverageTable + 2); + + // Binary search. + stbtt_int32 l=0, r=glyphCount-1, m; + int straw, needle=glyph; + while (l <= r) { + stbtt_uint8 *glyphArray = coverageTable + 4; + stbtt_uint16 glyphID; + m = (l + r) >> 1; + glyphID = ttUSHORT(glyphArray + 2 * m); + straw = glyphID; + if (needle < straw) + r = m - 1; + else if (needle > straw) + l = m + 1; + else { + return m; } - } break; - - case 2: { - stbtt_uint16 rangeCount = ttUSHORT(coverageTable + 2); - stbtt_uint8 *rangeArray = coverageTable + 4; - - // Binary search. - stbtt_int32 l=0, r=rangeCount-1, m; - int strawStart, strawEnd, needle=glyph; - while (l <= r) { - stbtt_uint8 *rangeRecord; - m = (l + r) >> 1; - rangeRecord = rangeArray + 6 * m; - strawStart = ttUSHORT(rangeRecord); - strawEnd = ttUSHORT(rangeRecord + 2); - if (needle < strawStart) - r = m - 1; - else if (needle > strawEnd) - l = m + 1; - else { - stbtt_uint16 startCoverageIndex = ttUSHORT(rangeRecord + 4); - return startCoverageIndex + glyph - strawStart; - } + } + break; + } + + case 2: { + stbtt_uint16 rangeCount = ttUSHORT(coverageTable + 2); + stbtt_uint8 *rangeArray = coverageTable + 4; + + // Binary search. + stbtt_int32 l=0, r=rangeCount-1, m; + int strawStart, strawEnd, needle=glyph; + while (l <= r) { + stbtt_uint8 *rangeRecord; + m = (l + r) >> 1; + rangeRecord = rangeArray + 6 * m; + strawStart = ttUSHORT(rangeRecord); + strawEnd = ttUSHORT(rangeRecord + 2); + if (needle < strawStart) + r = m - 1; + else if (needle > strawEnd) + l = m + 1; + else { + stbtt_uint16 startCoverageIndex = ttUSHORT(rangeRecord + 4); + return startCoverageIndex + glyph - strawStart; } - } break; + } + break; + } - default: { - // There are no other cases. - STBTT_assert(0); - } break; - } + default: return -1; // unsupported + } - return -1; + return -1; } static stbtt_int32 stbtt__GetGlyphClass(stbtt_uint8 *classDefTable, int glyph) { - stbtt_uint16 classDefFormat = ttUSHORT(classDefTable); - switch(classDefFormat) - { - case 1: { - stbtt_uint16 startGlyphID = ttUSHORT(classDefTable + 2); - stbtt_uint16 glyphCount = ttUSHORT(classDefTable + 4); - stbtt_uint8 *classDef1ValueArray = classDefTable + 6; - - if (glyph >= startGlyphID && glyph < startGlyphID + glyphCount) - return (stbtt_int32)ttUSHORT(classDef1ValueArray + 2 * (glyph - startGlyphID)); - - classDefTable = classDef1ValueArray + 2 * glyphCount; - } break; - - case 2: { - stbtt_uint16 classRangeCount = ttUSHORT(classDefTable + 2); - stbtt_uint8 *classRangeRecords = classDefTable + 4; - - // Binary search. - stbtt_int32 l=0, r=classRangeCount-1, m; - int strawStart, strawEnd, needle=glyph; - while (l <= r) { - stbtt_uint8 *classRangeRecord; - m = (l + r) >> 1; - classRangeRecord = classRangeRecords + 6 * m; - strawStart = ttUSHORT(classRangeRecord); - strawEnd = ttUSHORT(classRangeRecord + 2); - if (needle < strawStart) - r = m - 1; - else if (needle > strawEnd) - l = m + 1; - else - return (stbtt_int32)ttUSHORT(classRangeRecord + 4); - } + stbtt_uint16 classDefFormat = ttUSHORT(classDefTable); + switch (classDefFormat) + { + case 1: { + stbtt_uint16 startGlyphID = ttUSHORT(classDefTable + 2); + stbtt_uint16 glyphCount = ttUSHORT(classDefTable + 4); + stbtt_uint8 *classDef1ValueArray = classDefTable + 6; - classDefTable = classRangeRecords + 6 * classRangeCount; - } break; + if (glyph >= startGlyphID && glyph < startGlyphID + glyphCount) + return (stbtt_int32)ttUSHORT(classDef1ValueArray + 2 * (glyph - startGlyphID)); + break; + } - default: { - // There are no other cases. - STBTT_assert(0); - } break; - } + case 2: { + stbtt_uint16 classRangeCount = ttUSHORT(classDefTable + 2); + stbtt_uint8 *classRangeRecords = classDefTable + 4; + + // Binary search. + stbtt_int32 l=0, r=classRangeCount-1, m; + int strawStart, strawEnd, needle=glyph; + while (l <= r) { + stbtt_uint8 *classRangeRecord; + m = (l + r) >> 1; + classRangeRecord = classRangeRecords + 6 * m; + strawStart = ttUSHORT(classRangeRecord); + strawEnd = ttUSHORT(classRangeRecord + 2); + if (needle < strawStart) + r = m - 1; + else if (needle > strawEnd) + l = m + 1; + else + return (stbtt_int32)ttUSHORT(classRangeRecord + 4); + } + break; + } - return -1; + default: + return -1; // Unsupported definition type, return an error. + } + + // "All glyphs not assigned to a class fall into class 0". (OpenType spec) + return 0; } // Define to STBTT_assert(x) if you want to break on unimplemented formats. #define STBTT_GPOS_TODO_assert(x) -static stbtt_int32 stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2) -{ - stbtt_uint16 lookupListOffset; - stbtt_uint8 *lookupList; - stbtt_uint16 lookupCount; - stbtt_uint8 *data; - stbtt_int32 i; - - if (!info->gpos) return 0; - - data = info->data + info->gpos; - - if (ttUSHORT(data+0) != 1) return 0; // Major version 1 - if (ttUSHORT(data+2) != 0) return 0; // Minor version 0 - - lookupListOffset = ttUSHORT(data+8); - lookupList = data + lookupListOffset; - lookupCount = ttUSHORT(lookupList); - - for (i=0; i> 1; - pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m; - secondGlyph = ttUSHORT(pairValue); - straw = secondGlyph; - if (needle < straw) - r = m - 1; - else if (needle > straw) - l = m + 1; - else { - stbtt_int16 xAdvance = ttSHORT(pairValue + 2); - return xAdvance; - } - } - } break; - - case 2: { - stbtt_uint16 valueFormat1 = ttUSHORT(table + 4); - stbtt_uint16 valueFormat2 = ttUSHORT(table + 6); - - stbtt_uint16 classDef1Offset = ttUSHORT(table + 8); - stbtt_uint16 classDef2Offset = ttUSHORT(table + 10); - int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1); - int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2); - - stbtt_uint16 class1Count = ttUSHORT(table + 12); - stbtt_uint16 class2Count = ttUSHORT(table + 14); - STBTT_assert(glyph1class < class1Count); - STBTT_assert(glyph2class < class2Count); - - // TODO: Support more formats. - STBTT_GPOS_TODO_assert(valueFormat1 == 4); - if (valueFormat1 != 4) return 0; - STBTT_GPOS_TODO_assert(valueFormat2 == 0); - if (valueFormat2 != 0) return 0; - - if (glyph1class >= 0 && glyph1class < class1Count && glyph2class >= 0 && glyph2class < class2Count) { - stbtt_uint8 *class1Records = table + 16; - stbtt_uint8 *class2Records = class1Records + 2 * (glyph1class * class2Count); - stbtt_int16 xAdvance = ttSHORT(class2Records + 2 * glyph2class); - return xAdvance; - } - } break; - - default: { - // There are no other cases. - STBTT_assert(0); - break; - }; - } - } - break; - }; +static stbtt_int32 stbtt__GetGlyphGPOSInfoAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2) +{ + stbtt_uint16 lookupListOffset; + stbtt_uint8 *lookupList; + stbtt_uint16 lookupCount; + stbtt_uint8 *data; + stbtt_int32 i, sti; + + if (!info->gpos) return 0; + + data = info->data + info->gpos; + + if (ttUSHORT(data+0) != 1) return 0; // Major version 1 + if (ttUSHORT(data+2) != 0) return 0; // Minor version 0 + + lookupListOffset = ttUSHORT(data+8); + lookupList = data + lookupListOffset; + lookupCount = ttUSHORT(lookupList); + + for (i=0; i= pairSetCount) return 0; + + needle=glyph2; + r=pairValueCount-1; + l=0; + + // Binary search. + while (l <= r) { + stbtt_uint16 secondGlyph; + stbtt_uint8 *pairValue; + m = (l + r) >> 1; + pairValue = pairValueArray + (2 + valueRecordPairSizeInBytes) * m; + secondGlyph = ttUSHORT(pairValue); + straw = secondGlyph; + if (needle < straw) + r = m - 1; + else if (needle > straw) + l = m + 1; + else { + stbtt_int16 xAdvance = ttSHORT(pairValue + 2); + return xAdvance; + } + } + } else + return 0; + break; + } + + case 2: { + stbtt_uint16 valueFormat1 = ttUSHORT(table + 4); + stbtt_uint16 valueFormat2 = ttUSHORT(table + 6); + if (valueFormat1 == 4 && valueFormat2 == 0) { // Support more formats? + stbtt_uint16 classDef1Offset = ttUSHORT(table + 8); + stbtt_uint16 classDef2Offset = ttUSHORT(table + 10); + int glyph1class = stbtt__GetGlyphClass(table + classDef1Offset, glyph1); + int glyph2class = stbtt__GetGlyphClass(table + classDef2Offset, glyph2); + + stbtt_uint16 class1Count = ttUSHORT(table + 12); + stbtt_uint16 class2Count = ttUSHORT(table + 14); + stbtt_uint8 *class1Records, *class2Records; + stbtt_int16 xAdvance; + + if (glyph1class < 0 || glyph1class >= class1Count) return 0; // malformed + if (glyph2class < 0 || glyph2class >= class2Count) return 0; // malformed + + class1Records = table + 16; + class2Records = class1Records + 2 * (glyph1class * class2Count); + xAdvance = ttSHORT(class2Records + 2 * glyph2class); + return xAdvance; + } else + return 0; + break; + } default: - // TODO: Implement other stuff. - break; - } - } + return 0; // Unsupported position format + } + } + } - return 0; + return 0; } STBTT_DEF int stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int g1, int g2) @@ -3075,6 +3062,23 @@ static void stbtt__handle_clipped_edge(float *scanline, int x, stbtt__active_edg } } +static float stbtt__sized_trapezoid_area(float height, float top_width, float bottom_width) +{ + STBTT_assert(top_width >= 0); + STBTT_assert(bottom_width >= 0); + return (top_width + bottom_width) / 2.0f * height; +} + +static float stbtt__position_trapezoid_area(float height, float tx0, float tx1, float bx0, float bx1) +{ + return stbtt__sized_trapezoid_area(height, tx1 - tx0, bx1 - bx0); +} + +static float stbtt__sized_triangle_area(float height, float width) +{ + return height * width / 2; +} + static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, int len, stbtt__active_edge *e, float y_top) { float y_bottom = y_top+1; @@ -3129,13 +3133,13 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, float height; // simple case, only spans one pixel int x = (int) x_top; - height = sy1 - sy0; + height = (sy1 - sy0) * e->direction; STBTT_assert(x >= 0 && x < len); - scanline[x] += e->direction * (1-((x_top - x) + (x_bottom-x))/2) * height; - scanline_fill[x] += e->direction * height; // everything right of this pixel is filled + scanline[x] += stbtt__position_trapezoid_area(height, x_top, x+1.0f, x_bottom, x+1.0f); + scanline_fill[x] += height; // everything right of this pixel is filled } else { int x,x1,x2; - float y_crossing, step, sign, area; + float y_crossing, y_final, step, sign, area; // covers 2+ pixels if (x_top > x_bottom) { // flip scanline vertically; signed area is the same @@ -3148,29 +3152,79 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, dy = -dy; t = x0, x0 = xb, xb = t; } + STBTT_assert(dy >= 0); + STBTT_assert(dx >= 0); x1 = (int) x_top; x2 = (int) x_bottom; // compute intersection with y axis at x1+1 - y_crossing = (x1+1 - x0) * dy + y_top; + y_crossing = y_top + dy * (x1+1 - x0); + + // compute intersection with y axis at x2 + y_final = y_top + dy * (x2 - x0); + + // x1 x_top x2 x_bottom + // y_top +------|-----+------------+------------+--------|---+------------+ + // | | | | | | + // | | | | | | + // sy0 | Txxxxx|............|............|............|............| + // y_crossing | *xxxxx.......|............|............|............| + // | | xxxxx..|............|............|............| + // | | /- xx*xxxx........|............|............| + // | | dy < | xxxxxx..|............|............| + // y_final | | \- | xx*xxx.........|............| + // sy1 | | | | xxxxxB...|............| + // | | | | | | + // | | | | | | + // y_bottom +------------+------------+------------+------------+------------+ + // + // goal is to measure the area covered by '.' in each pixel + + // if x2 is right at the right edge of x1, y_crossing can blow up, github #1057 + // @TODO: maybe test against sy1 rather than y_bottom? + if (y_crossing > y_bottom) + y_crossing = y_bottom; sign = e->direction; - // area of the rectangle covered from y0..y_crossing + + // area of the rectangle covered from sy0..y_crossing area = sign * (y_crossing-sy0); - // area of the triangle (x_top,y0), (x+1,y0), (x+1,y_crossing) - scanline[x1] += area * (1-((x_top - x1)+(x1+1-x1))/2); - step = sign * dy; + // area of the triangle (x_top,sy0), (x1+1,sy0), (x1+1,y_crossing) + scanline[x1] += stbtt__sized_triangle_area(area, x1+1 - x_top); + + // check if final y_crossing is blown up; no test case for this + if (y_final > y_bottom) { + y_final = y_bottom; + dy = (y_final - y_crossing ) / (x2 - (x1+1)); // if denom=0, y_final = y_crossing, so y_final <= y_bottom + } + + // in second pixel, area covered by line segment found in first pixel + // is always a rectangle 1 wide * the height of that line segment; this + // is exactly what the variable 'area' stores. it also gets a contribution + // from the line segment within it. the THIRD pixel will get the first + // pixel's rectangle contribution, the second pixel's rectangle contribution, + // and its own contribution. the 'own contribution' is the same in every pixel except + // the leftmost and rightmost, a trapezoid that slides down in each pixel. + // the second pixel's contribution to the third pixel will be the + // rectangle 1 wide times the height change in the second pixel, which is dy. + + step = sign * dy * 1; // dy is dy/dx, change in y for every 1 change in x, + // which multiplied by 1-pixel-width is how much pixel area changes for each step in x + // so the area advances by 'step' every time + for (x = x1+1; x < x2; ++x) { - scanline[x] += area + step/2; + scanline[x] += area + step/2; // area of trapezoid is 1*step/2 area += step; } - y_crossing += dy * (x2 - (x1+1)); - - STBTT_assert(STBTT_fabs(area) <= 1.01f); + STBTT_assert(STBTT_fabs(area) <= 1.01f); // accumulated error from area += step unless we round step down + STBTT_assert(sy1 > y_final-0.01f); - scanline[x2] += area + sign * (1-((x2-x2)+(x_bottom-x2))/2) * (sy1-y_crossing); + // area covered in the last pixel is the rectangle from all the pixels to the left, + // plus the trapezoid filled by the line segment in this pixel all the way to the right edge + scanline[x2] += area + sign * stbtt__position_trapezoid_area(sy1-y_final, (float) x2, x2+1.0f, x_bottom, x2+1.0f); + // the rest of the line is filled based on the total height of the line segment in this pixel scanline_fill[x2] += sign * (sy1-sy0); } } else { @@ -3178,6 +3232,9 @@ static void stbtt__fill_active_edges_new(float *scanline, float *scanline_fill, // clipping logic. since this does not match the intended use // of this library, we use a different, very slow brute // force implementation + // note though that this does happen some of the time because + // x_top and x_bottom can be extrapolated at the top & bottom of + // the shape and actually lie outside the bounding box int x; for (x=0; x < len; ++x) { // cases: @@ -4414,15 +4471,14 @@ static int stbtt__compute_crossings_x(float x, float y, int nverts, stbtt_vertex float y_frac; int winding = 0; - orig[0] = x; - orig[1] = y; - // make sure y never passes through a vertex of the shape y_frac = (float) STBTT_fmod(y, 1.0f); if (y_frac < 0.01f) y += 0.01f; else if (y_frac > 0.99f) y -= 0.01f; + + orig[0] = x; orig[1] = y; // test a ray from (-infinity,y) to (x,y) @@ -4484,35 +4540,35 @@ static float stbtt__cuberoot( float x ) return (float) STBTT_pow( x,1.0f/3.0f); } -// x^3 + c*x^2 + b*x + a = 0 +// x^3 + a*x^2 + b*x + c = 0 static int stbtt__solve_cubic(float a, float b, float c, float* r) { - float s = -a / 3; - float p = b - a*a / 3; - float q = a * (2*a*a - 9*b) / 27 + c; + float s = -a / 3; + float p = b - a*a / 3; + float q = a * (2*a*a - 9*b) / 27 + c; float p3 = p*p*p; - float d = q*q + 4*p3 / 27; - if (d >= 0) { - float z = (float) STBTT_sqrt(d); - float u = (-q + z) / 2; - float v = (-q - z) / 2; - u = stbtt__cuberoot(u); - v = stbtt__cuberoot(v); - r[0] = s + u + v; - return 1; - } else { - float u = (float) STBTT_sqrt(-p/3); - float v = (float) STBTT_acos(-STBTT_sqrt(-27/p3) * q / 2) / 3; // p3 must be negative, since d is negative - float m = (float) STBTT_cos(v); + float d = q*q + 4*p3 / 27; + if (d >= 0) { + float z = (float) STBTT_sqrt(d); + float u = (-q + z) / 2; + float v = (-q - z) / 2; + u = stbtt__cuberoot(u); + v = stbtt__cuberoot(v); + r[0] = s + u + v; + return 1; + } else { + float u = (float) STBTT_sqrt(-p/3); + float v = (float) STBTT_acos(-STBTT_sqrt(-27/p3) * q / 2) / 3; // p3 must be negative, since d is negative + float m = (float) STBTT_cos(v); float n = (float) STBTT_cos(v-3.141592/2)*1.732050808f; - r[0] = s + u * 2 * m; - r[1] = s - u * (m + n); - r[2] = s - u * (m - n); + r[0] = s + u * 2 * m; + r[1] = s - u * (m + n); + r[2] = s - u * (m - n); //STBTT_assert( STBTT_fabs(((r[0]+a)*r[0]+b)*r[0]+c) < 0.05f); // these asserts may not be safe at all scales, though they're in bezier t parameter units so maybe? //STBTT_assert( STBTT_fabs(((r[1]+a)*r[1]+b)*r[1]+c) < 0.05f); //STBTT_assert( STBTT_fabs(((r[2]+a)*r[2]+b)*r[2]+c) < 0.05f); - return 3; + return 3; } } @@ -4548,6 +4604,8 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc scale_y = -scale_y; { + // distance from singular values (in the same units as the pixel grid) + const float eps = 1./1024, eps2 = eps*eps; int x,y,i,j; float *precompute; stbtt_vertex *verts; @@ -4560,15 +4618,15 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y; float x1 = verts[j].x*scale_x, y1 = verts[j].y*scale_y; float dist = (float) STBTT_sqrt((x1-x0)*(x1-x0) + (y1-y0)*(y1-y0)); - precompute[i] = (dist == 0) ? 0.0f : 1.0f / dist; + precompute[i] = (dist < eps) ? 0.0f : 1.0f / dist; } else if (verts[i].type == STBTT_vcurve) { float x2 = verts[j].x *scale_x, y2 = verts[j].y *scale_y; float x1 = verts[i].cx*scale_x, y1 = verts[i].cy*scale_y; float x0 = verts[i].x *scale_x, y0 = verts[i].y *scale_y; float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2; float len2 = bx*bx + by*by; - if (len2 != 0.0f) - precompute[i] = 1.0f / (bx*bx + by*by); + if (len2 >= eps2) + precompute[i] = 1.0f / len2; else precompute[i] = 0.0f; } else @@ -4589,18 +4647,17 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc for (i=0; i < num_verts; ++i) { float x0 = verts[i].x*scale_x, y0 = verts[i].y*scale_y; - // check against every point here rather than inside line/curve primitives -- @TODO: wrong if multiple 'moves' in a row produce a garbage point, and given culling, probably more efficient to do within line/curve - float dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy); - if (dist2 < min_dist*min_dist) - min_dist = (float) STBTT_sqrt(dist2); - - if (verts[i].type == STBTT_vline) { + if (verts[i].type == STBTT_vline && precompute[i] != 0.0f) { float x1 = verts[i-1].x*scale_x, y1 = verts[i-1].y*scale_y; + float dist,dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy); + if (dist2 < min_dist*min_dist) + min_dist = (float) STBTT_sqrt(dist2); + // coarse culling against bbox //if (sx > STBTT_min(x0,x1)-min_dist && sx < STBTT_max(x0,x1)+min_dist && // sy > STBTT_min(y0,y1)-min_dist && sy < STBTT_max(y0,y1)+min_dist) - float dist = (float) STBTT_fabs((x1-x0)*(y0-sy) - (y1-y0)*(x0-sx)) * precompute[i]; + dist = (float) STBTT_fabs((x1-x0)*(y0-sy) - (y1-y0)*(x0-sx)) * precompute[i]; STBTT_assert(i != 0); if (dist < min_dist) { // check position along line @@ -4627,14 +4684,15 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc float ax = x1-x0, ay = y1-y0; float bx = x0 - 2*x1 + x2, by = y0 - 2*y1 + y2; float mx = x0 - sx, my = y0 - sy; - float res[3],px,py,t,it; + float res[3] = {0.f,0.f,0.f}; + float px,py,t,it,dist2; float a_inv = precompute[i]; if (a_inv == 0.0) { // if a_inv is 0, it's 2nd degree so use quadratic formula float a = 3*(ax*bx + ay*by); float b = 2*(ax*ax + ay*ay) + (mx*bx+my*by); float c = mx*ax+my*ay; - if (a == 0.0) { // if a is 0, it's linear - if (b != 0.0) { + if (STBTT_fabs(a) < eps2) { // if a is 0, it's linear + if (STBTT_fabs(b) >= eps2) { res[num++] = -c/b; } } else { @@ -4654,6 +4712,10 @@ STBTT_DEF unsigned char * stbtt_GetGlyphSDF(const stbtt_fontinfo *info, float sc float d = (mx*ax+my*ay) * a_inv; num = stbtt__solve_cubic(b, c, d, res); } + dist2 = (x0-sx)*(x0-sx) + (y0-sy)*(y0-sy); + if (dist2 < min_dist*min_dist) + min_dist = (float) STBTT_sqrt(dist2); + if (num >= 1 && res[0] >= 0.0f && res[0] <= 1.0f) { t = res[0], it = 1.0f - t; px = it*it*x0 + 2*t*it*x1 + t*t*x2; @@ -4913,6 +4975,12 @@ STBTT_DEF int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const // FULL VERSION HISTORY // +// 1.25 (2021-07-11) many fixes +// 1.24 (2020-02-05) fix warning +// 1.23 (2020-02-02) query SVG data for glyphs; query whole kerning table (but only kern not GPOS) +// 1.22 (2019-08-11) minimize missing-glyph duplication; fix kerning if both 'GPOS' and 'kern' are defined +// 1.21 (2019-02-25) fix warning +// 1.20 (2019-02-07) PackFontRange skips missing codepoints; GetScaleFontVMetrics() // 1.19 (2018-02-11) OpenType GPOS kerning (horizontal only), STBTT_fmod // 1.18 (2018-01-29) add missing function // 1.17 (2017-07-23) make more arguments const; doc fix diff --git a/third_party/stb/stb_vorbis.c b/third_party/stb/stb_vorbis.c index a8cbfa6..3e5c250 100644 --- a/third_party/stb/stb_vorbis.c +++ b/third_party/stb/stb_vorbis.c @@ -1,4 +1,4 @@ -// Ogg Vorbis audio decoder - v1.20 - public domain +// Ogg Vorbis audio decoder - v1.22 - public domain // http://nothings.org/stb_vorbis/ // // Original version written by Sean Barrett in 2007. @@ -29,12 +29,15 @@ // Bernhard Wodo Evan Balster github:alxprd // Tom Beaumont Ingo Leitgeb Nicolas Guillemot // Phillip Bennefall Rohit Thiago Goulart -// github:manxorist saga musix github:infatum +// github:manxorist Saga Musix github:infatum // Timur Gagiev Maxwell Koo Peter Waller // github:audinowho Dougall Johnson David Reid // github:Clownacy Pedro J. Estebanez Remi Verschelde +// AnthoFoxo github:morlat Gabriel Ravier // // Partial history: +// 1.22 - 2021-07-11 - various small fixes +// 1.21 - 2021-07-02 - fix bug for files with no comments // 1.20 - 2020-07-11 - several small fixes // 1.19 - 2020-02-05 - warnings // 1.18 - 2020-02-02 - fix seek bugs; parse header comments; misc warnings etc. @@ -220,6 +223,12 @@ extern int stb_vorbis_decode_frame_pushdata( // channel. In other words, (*output)[0][0] contains the first sample from // the first channel, and (*output)[1][0] contains the first sample from // the second channel. +// +// *output points into stb_vorbis's internal output buffer storage; these +// buffers are owned by stb_vorbis and application code should not free +// them or modify their contents. They are transient and will be overwritten +// once you ask for more data to get decoded, so be sure to grab any data +// you need before then. extern void stb_vorbis_flush_pushdata(stb_vorbis *f); // inform stb_vorbis that your next datablock will not be contiguous with @@ -579,7 +588,7 @@ enum STBVorbisError #if defined(_MSC_VER) || defined(__MINGW32__) #include #endif - #if defined(__linux__) || defined(__linux) || defined(__EMSCRIPTEN__) || defined(__NEWLIB__) + #if defined(__linux__) || defined(__linux) || defined(__sun__) || defined(__EMSCRIPTEN__) || defined(__NEWLIB__) #include #endif #else // STB_VORBIS_NO_CRT @@ -646,6 +655,12 @@ typedef signed int int32; typedef float codetype; +#ifdef _MSC_VER +#define STBV_NOTUSED(v) (void)(v) +#else +#define STBV_NOTUSED(v) (void)sizeof(v) +#endif + // @NOTE // // Some arrays below are tagged "//varies", which means it's actually @@ -1046,7 +1061,7 @@ static float float32_unpack(uint32 x) uint32 sign = x & 0x80000000; uint32 exp = (x & 0x7fe00000) >> 21; double res = sign ? -(double)mantissa : (double)mantissa; - return (float) ldexp((float)res, exp-788); + return (float) ldexp((float)res, (int)exp-788); } @@ -1077,6 +1092,7 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values) // find the first entry for (k=0; k < n; ++k) if (len[k] < NO_CODE) break; if (k == n) { assert(c->sorted_entries == 0); return TRUE; } + assert(len[k] < 32); // no error return required, code reading lens checks this // add to the list add_entry(c, 0, k, m++, len[k], values); // add all available leaves @@ -1090,6 +1106,7 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values) uint32 res; int z = len[i], y; if (z == NO_CODE) continue; + assert(z < 32); // no error return required, code reading lens checks this // find lowest available leaf (should always be earliest, // which is what the specification calls for) // note that this property, and the fact we can never have @@ -1099,12 +1116,10 @@ static int compute_codewords(Codebook *c, uint8 *len, int n, uint32 *values) while (z > 0 && !available[z]) --z; if (z == 0) { return FALSE; } res = available[z]; - assert(z >= 0 && z < 32); available[z] = 0; add_entry(c, bit_reverse(res), i, m++, len[i], values); // propagate availability up the tree if (z != len[i]) { - assert(len[i] >= 0 && len[i] < 32); for (y=len[i]; y > z; --y) { assert(available[y] == 0); available[y] = res + (1 << (32-y)); @@ -2577,34 +2592,33 @@ static void imdct_step3_inner_s_loop_ld654(int n, float *e, int i_off, float *A, while (z > base) { float k00,k11; - - k00 = z[-0] - z[-8]; - k11 = z[-1] - z[-9]; - z[-0] = z[-0] + z[-8]; - z[-1] = z[-1] + z[-9]; - z[-8] = k00; - z[-9] = k11 ; - - k00 = z[ -2] - z[-10]; - k11 = z[ -3] - z[-11]; - z[ -2] = z[ -2] + z[-10]; - z[ -3] = z[ -3] + z[-11]; - z[-10] = (k00+k11) * A2; - z[-11] = (k11-k00) * A2; - - k00 = z[-12] - z[ -4]; // reverse to avoid a unary negation + float l00,l11; + + k00 = z[-0] - z[ -8]; + k11 = z[-1] - z[ -9]; + l00 = z[-2] - z[-10]; + l11 = z[-3] - z[-11]; + z[ -0] = z[-0] + z[ -8]; + z[ -1] = z[-1] + z[ -9]; + z[ -2] = z[-2] + z[-10]; + z[ -3] = z[-3] + z[-11]; + z[ -8] = k00; + z[ -9] = k11; + z[-10] = (l00+l11) * A2; + z[-11] = (l11-l00) * A2; + + k00 = z[ -4] - z[-12]; k11 = z[ -5] - z[-13]; + l00 = z[ -6] - z[-14]; + l11 = z[ -7] - z[-15]; z[ -4] = z[ -4] + z[-12]; z[ -5] = z[ -5] + z[-13]; - z[-12] = k11; - z[-13] = k00; - - k00 = z[-14] - z[ -6]; // reverse to avoid a unary negation - k11 = z[ -7] - z[-15]; z[ -6] = z[ -6] + z[-14]; z[ -7] = z[ -7] + z[-15]; - z[-14] = (k00+k11) * A2; - z[-15] = (k00-k11) * A2; + z[-12] = k11; + z[-13] = -k00; + z[-14] = (l11-l00) * A2; + z[-15] = (l00+l11) * -A2; iter_54(z); iter_54(z-8); @@ -3069,6 +3083,7 @@ static int do_floor(vorb *f, Mapping *map, int i, int n, float *target, YTYPE *f for (q=1; q < g->values; ++q) { j = g->sorted_order[q]; #ifndef STB_VORBIS_NO_DEFER_FLOOR + STBV_NOTUSED(step2_flag); if (finalY[j] >= 0) #else if (step2_flag[j]) @@ -3171,6 +3186,7 @@ static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start, // WINDOWING + STBV_NOTUSED(left_end); n = f->blocksize[m->blockflag]; map = &f->mapping[m->mapping]; @@ -3368,7 +3384,7 @@ static int vorbis_decode_packet_rest(vorb *f, int *len, Mode *m, int left_start, // this isn't to spec, but spec would require us to read ahead // and decode the size of all current frames--could be done, // but presumably it's not a commonly used feature - f->current_loc = -n2; // start of first frame is positioned for discard + f->current_loc = 0u - n2; // start of first frame is positioned for discard (NB this is an intentional unsigned overflow/wrap-around) // we might have to discard samples "from" the next frame too, // if we're lapping a large block then a small at the start? f->discard_samples_deferred = n - right_end; @@ -3642,8 +3658,12 @@ static int start_decoder(vorb *f) f->vendor[len] = (char)'\0'; //user comments f->comment_list_length = get32_packet(f); - f->comment_list = (char**)setup_malloc(f, sizeof(char*) * (f->comment_list_length)); - if (f->comment_list == NULL) return error(f, VORBIS_outofmem); + f->comment_list = NULL; + if (f->comment_list_length > 0) + { + f->comment_list = (char**) setup_malloc(f, sizeof(char*) * (f->comment_list_length)); + if (f->comment_list == NULL) return error(f, VORBIS_outofmem); + } for(i=0; i < f->comment_list_length; ++i) { len = get32_packet(f); @@ -3865,8 +3885,7 @@ static int start_decoder(vorb *f) unsigned int div=1; for (k=0; k < c->dimensions; ++k) { int off = (z / div) % c->lookup_values; - float val = mults[off]; - val = mults[off]*c->delta_value + c->minimum_value + last; + float val = mults[off]*c->delta_value + c->minimum_value + last; c->multiplicands[j*c->dimensions + k] = val; if (c->sequence_p) last = val; @@ -3949,7 +3968,7 @@ static int start_decoder(vorb *f) if (g->class_masterbooks[j] >= f->codebook_count) return error(f, VORBIS_invalid_setup); } for (k=0; k < 1 << g->class_subclasses[j]; ++k) { - g->subclass_books[j][k] = get_bits(f,8)-1; + g->subclass_books[j][k] = (int16)get_bits(f,8)-1; if (g->subclass_books[j][k] >= f->codebook_count) return error(f, VORBIS_invalid_setup); } } @@ -4507,6 +4526,7 @@ stb_vorbis *stb_vorbis_open_pushdata( *error = VORBIS_need_more_data; else *error = p.error; + vorbis_deinit(&p); return NULL; } f = vorbis_alloc(&p); @@ -4564,7 +4584,7 @@ static uint32 vorbis_find_page(stb_vorbis *f, uint32 *end, uint32 *last) header[i] = get8(f); if (f->eof) return 0; if (header[4] != 0) goto invalid; - goal = header[22] + (header[23] << 8) + (header[24]<<16) + (header[25]<<24); + goal = header[22] + (header[23] << 8) + (header[24]<<16) + ((uint32)header[25]<<24); for (i=22; i < 26; ++i) header[i] = 0; crc = 0; @@ -4968,7 +4988,7 @@ unsigned int stb_vorbis_stream_length_in_samples(stb_vorbis *f) // set. whoops! break; } - previous_safe = last_page_loc+1; + //previous_safe = last_page_loc+1; // NOTE: not used after this point, but note for debugging last_page_loc = stb_vorbis_get_file_offset(f); } @@ -5079,7 +5099,10 @@ stb_vorbis * stb_vorbis_open_filename(const char *filename, int *error, const st stb_vorbis * stb_vorbis_open_memory(const unsigned char *data, int len, int *error, const stb_vorbis_alloc *alloc) { stb_vorbis *f, p; - if (data == NULL) return NULL; + if (!data) { + if (error) *error = VORBIS_unexpected_eof; + return NULL; + } vorbis_init(&p, alloc); p.stream = (uint8 *) data; p.stream_end = (uint8 *) data + len; @@ -5154,11 +5177,11 @@ static void copy_samples(short *dest, float *src, int len) static void compute_samples(int mask, short *output, int num_c, float **data, int d_offset, int len) { - #define BUFFER_SIZE 32 - float buffer[BUFFER_SIZE]; - int i,j,o,n = BUFFER_SIZE; + #define STB_BUFFER_SIZE 32 + float buffer[STB_BUFFER_SIZE]; + int i,j,o,n = STB_BUFFER_SIZE; check_endianness(); - for (o = 0; o < len; o += BUFFER_SIZE) { + for (o = 0; o < len; o += STB_BUFFER_SIZE) { memset(buffer, 0, sizeof(buffer)); if (o + n > len) n = len - o; for (j=0; j < num_c; ++j) { @@ -5175,16 +5198,17 @@ static void compute_samples(int mask, short *output, int num_c, float **data, in output[o+i] = v; } } + #undef STB_BUFFER_SIZE } static void compute_stereo_samples(short *output, int num_c, float **data, int d_offset, int len) { - #define BUFFER_SIZE 32 - float buffer[BUFFER_SIZE]; - int i,j,o,n = BUFFER_SIZE >> 1; + #define STB_BUFFER_SIZE 32 + float buffer[STB_BUFFER_SIZE]; + int i,j,o,n = STB_BUFFER_SIZE >> 1; // o is the offset in the source data check_endianness(); - for (o = 0; o < len; o += BUFFER_SIZE >> 1) { + for (o = 0; o < len; o += STB_BUFFER_SIZE >> 1) { // o2 is the offset in the output data int o2 = o << 1; memset(buffer, 0, sizeof(buffer)); @@ -5214,6 +5238,7 @@ static void compute_stereo_samples(short *output, int num_c, float **data, int d output[o2+i] = v; } } + #undef STB_BUFFER_SIZE } static void convert_samples_short(int buf_c, short **buffer, int b_offset, int data_c, float **data, int d_offset, int samples) @@ -5286,8 +5311,6 @@ int stb_vorbis_get_samples_short_interleaved(stb_vorbis *f, int channels, short float **outputs; int len = num_shorts / channels; int n=0; - int z = f->channels; - if (z > channels) z = channels; while (n < len) { int k = f->channel_buffer_end - f->channel_buffer_start; if (n+k >= len) k = len - n; @@ -5306,8 +5329,6 @@ int stb_vorbis_get_samples_short(stb_vorbis *f, int channels, short **buffer, in { float **outputs; int n=0; - int z = f->channels; - if (z > channels) z = channels; while (n < len) { int k = f->channel_buffer_end - f->channel_buffer_start; if (n+k >= len) k = len - n;