Author: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: Use the SIMD Everywhere (SIMDe) library for the SSE/AVX/AVX2 intrinsics
Forwarded: https://github.com/TimoLassmann/kalign/pull/20
--- kalign.orig/src/alignment.c
+++ kalign/src/alignment.c
@@ -20,7 +20,8 @@
 
 */
 
-#include <xmmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
 #include "alignment.h"
 
 
--- kalign.orig/src/bisectingKmeans.c
+++ kalign/src/bisectingKmeans.c
@@ -23,12 +23,16 @@
 #include <omp.h>
 #endif
 
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
+#if !defined(SIMDE_X86_SSE_NATIVE) /* no native SSE: map _mm_malloc/_mm_free to C11 */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc((align), (size))
+  #define _mm_free free
+#else
+  #include <mm_malloc.h>
 #endif
 
-#include <mm_malloc.h>
-
 #include "tlrng.h"
 #include "msa.h"
 
@@ -472,13 +476,8 @@
                         score = 0.0f;
                         for(i = 0; i < num_samples;i++){
                                 s = samples[i];
-#ifdef HAVE_AVX2
                                 edist_256(dm[s], cl, num_anchors, &dl);
                                 edist_256(dm[s], cr, num_anchors, &dr);
-#else
-                                edist_serial(dm[s], cl, num_anchors, &dl);
-                                edist_serial(dm[s], cr, num_anchors, &dr);
-#endif
                                 score += MACRO_MIN(dl,dr);
 
                                 if(dr < dl){
--- kalign.orig/src/bpm.c
+++ kalign/src/bpm.c
@@ -25,10 +25,8 @@
 
 #include "tlrng.h"
 
-
-
-#ifdef HAVE_AVX2
-#include <immintrin.h>
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx2.h>
 
 __m256i BROADCAST_MASK[16];
 
@@ -37,7 +35,6 @@
 
 /* taken from Alexander Yee: http://www.numberworld.org/y-cruncher/internals/addition.html#ks_add */
  __m256i add256(uint32_t carry, __m256i A, __m256i B);
-#endif
 
 /* Below are test functions  */
 #ifdef BPM_UTEST
@@ -51,11 +48,9 @@
 uint8_t dyn_256_print(const uint8_t* t,const uint8_t* p,int n,int m);
 int  mutate_seq(uint8_t* s, int len,int k,int L, struct rng_state* rng);
 
-#ifdef HAVE_AVX2
 /* For debugging */
 void print_256(__m256i X);
 void print_256_all(__m256i X);
-#endif
 
 /* The actual test.  */
 int bpm_test(void);
@@ -64,9 +59,7 @@
 {
 
         /* Important set_broadcast_mask has to be called before using bpm_256!!! */
-#ifdef HAVE_AVX2
         set_broadcast_mask();
-#endif
         RUN(bpm_test());
         return EXIT_SUCCESS;
 ERROR:
@@ -149,11 +142,7 @@
                 for (j =0 ; j < test_iter; j++){
                         RUN(mutate_seq(b,len,i,alphabet->L,rng));
                         dyn_score = dyn_256(a,b,len,len);
-#ifdef HAVE_AVX2
                         bpm_score = bpm_256(a,b,len,len);
-#else
-                        bpm_score = dyn_score;
-#endif
                         if( abs( dyn_score - bpm_score) != 0){
                                 fprintf(stdout,"Scores differ: %d (dyn) %d (bpm) (%d out of %d)\n", dyn_score,bpm_score, calc_errors , total_calc);
                                 calc_errors++;
@@ -200,13 +189,9 @@
         for(i = 0; i < 100;i+=10){
                 RUN(mutate_seq(b,len,i,alphabet->L,rng));
 
-#ifdef HAVE_AVX2
                 for(j = 0; j < timing_iter;j++){
                         bpm_score = bpm_256(a,b,len,len);
                 }
-#else
-                bpm_score = dyn_score;
-#endif
 
 
                 //ASSERT(dyn_score == bpm_score, "Scores differ: %d %d.",dyn_score, bpm_score);
@@ -373,7 +358,6 @@
 
 }
 
-#ifdef HAVE_AVX2
 void print_256(__m256i X)
 {
         alignas(32) uint64_t debug[4];
@@ -394,7 +378,6 @@
 }
 
 #endif
-#endif
 
 
 
@@ -462,7 +445,6 @@
 }
 
 
-#ifdef HAVE_AVX2
 uint8_t bpm_256(const uint8_t* t,const uint8_t* p,int n,int m)
 {
         __m256i VP,VN,D0,HN,HP,X,NOTONE;
@@ -658,4 +640,3 @@
         carryOut   = _mm256_xor_si256 (innerCarry, rotate);                        //FIXME: not sure if this is correct!!!
         return carryOut;
 }
-#endif
--- kalign.orig/src/euclidean_dist.c
+++ kalign/src/euclidean_dist.c
@@ -22,22 +22,24 @@
 
 #include "euclidean_dist.h"
 #include "tlrng.h"
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
-#include <immintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx.h>
 
-#include <mm_malloc.h>
+#if !defined(SIMDE_X86_SSE_NATIVE) /* no native SSE: map _mm_malloc/_mm_free to C11 */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc((align), (size))
+  #define _mm_free free
+#else
+  #include <mm_malloc.h>
+#endif
 
 #include "float.h"
 
 #include "esl_stopwatch.h"
 /* These functions were taken from:  */
 /* https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86 */
-#ifdef HAVE_AVX2
 float hsum256_ps_avx(__m256 v);
 float hsum_ps_sse3(__m128 v);
-#endif
 
 #ifdef ITEST_EDIST
 int main(void)
@@ -69,7 +71,6 @@
                 }
         }
         LOG_MSG("Check for correctness.");
-#ifdef HAVE_AVX2
         for(i = 0; i < 100;i++){
                 for(j = 0; j <= i;j++){
                         edist_serial(mat[i], mat[j], num_element, &d1);
@@ -80,7 +81,6 @@
                         }
                 }
         }
-#endif
         DECLARE_TIMER(t);
 
         LOG_MSG("Timing serial");
@@ -98,7 +98,6 @@
         GET_TIMING(t);
         //LOG_MSG("%f\tsec.",GET_TIMING(t));
 
-#ifdef HAVE_AVX2
         LOG_MSG("Timing AVX");
         START_TIMER(t);
         for(c = 0; c < max_iter; c++){
@@ -114,7 +113,6 @@
         GET_TIMING(t);
         //LOG_MSG("%f\tsec.",GET_TIMING(t));
 
-#endif
         for(i = 0; i < 100;i++){
                 _mm_free(mat[i]);
         }
@@ -162,7 +160,6 @@
         return OK;
 }
 
-#ifdef HAVE_AVX2
 
 int edist_256(const float* a,const float* b, const int len, float* ret)
 {
@@ -211,4 +208,3 @@
         return        _mm_cvtss_f32(sums);
 }
 
-#endif
--- kalign.orig/src/misc.c
+++ kalign/src/misc.c
@@ -23,14 +23,12 @@
 #include "tldevel.h"
 #include "tlrng.h"
 
-#ifdef HAVE_AVX2
-#include <immintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/avx.h>
 
 #include "misc.h"
 #include  <stdalign.h>
 #include <string.h>
-#include <immintrin.h>
 #ifdef ITEST_MISC
 
 
--- kalign.orig/src/sequence_distance.c
+++ kalign/src/sequence_distance.c
@@ -21,11 +21,16 @@
 
 */
 
-#ifdef HAVE_AVX2
-#include <xmmintrin.h>
-#endif
+#define SIMDE_ENABLE_NATIVE_ALIASES
+#include <simde/x86/sse.h>
 
-#include <mm_malloc.h>
+#if !defined(SIMDE_X86_SSE_NATIVE) /* no native SSE: map _mm_malloc/_mm_free to C11 */
+  #include <stdlib.h>
+  #define _mm_malloc(size, align) aligned_alloc((align), (size))
+  #define _mm_free free
+#else
+  #include <mm_malloc.h>
+#endif
 #include "sequence_distance.h"
 
 #include "alphabet.h"
@@ -64,9 +69,7 @@
         int len_b;
 
         int i,j;
-#if HAVE_AVX2
         set_broadcast_mask();
-#endif
 
         if(pair){
 
@@ -160,7 +163,6 @@
 
 float calc_distance(uint8_t* seq_a, uint8_t* seq_b, int len_a,int len_b, int L)
 {
-#ifdef HAVE_AVX2
         uint8_t dist;
         if(len_a > len_b){
                 dist = bpm_256(seq_a, seq_b, len_a, len_b);
@@ -168,51 +170,6 @@
                 dist = bpm_256(seq_b, seq_a, len_b, len_a);
         }
         return (float)dist;
-#else
-        struct bignode* hash[1024];
-        int i;
-        float dist;
-        unsigned int hv;
-        for (i = 0;i < 1024;i++){
-                hash[i] = 0;
-        }
-        /* Protein sequence  */
-        if( L > ALPHA_defDNA){
-
-                for (i = len_a-2;i--;){
-                        hv = (seq_a[i] << 5) + seq_a[i+1];
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = (seq_a[i] << 5) + seq_a[i+2];
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                }
-
-                dist = protein_wu_distance_calculation(hash,seq_b,len_b,len_a+len_b,58.9);
-        }else{
-
-                for (i = len_a-5;i--;){
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4)  + ((seq_a[i+3]&3)<<2) + (seq_a[i+4]&3);//ABCDE
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4)  + ((seq_a[i+3]&3)<<2) + (seq_a[i+5]&3);//ABCDF
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+2]&3)<<4)  + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ABCEF
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+1]&3)<<6) + ((seq_a[i+3]&3)<<4)  + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ABDEF
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                        hv = ((seq_a[i]&3)<<8) + ((seq_a[i+2]&3)<<6) + ((seq_a[i+3]&3)<<4) + ((seq_a[i+4]&3)<<2) + (seq_a[i+5]&3);//ACDEF
-                        hash[hv] = big_insert_hash(hash[hv],i);
-                }
-                dist = dna_distance_calculation(hash,seq_b,len_b,len_a+len_b, 61.08);
-        }
-
-
-        for (i = 1024;i--;){
-                if (hash[i]){
-                        big_remove_nodes(hash[i]);
-                        hash[i] = 0;
-                }
-        }
-        return dist;
-#endif
 
 }
 
