/**
 *
 * @file core_sxx2fr.c
 *
 * PaStiX low-rank kernel routines that form the product of two matrices A and B
 * into a low-rank form for an update on a full rank matrix.
 *
 * @copyright 2016-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
 *
 * @version 6.4.0
 * @author Mathieu Faverge
 * @author Gregoire Pichon
 * @author Pierre Ramet
 * @date 2024-07-05
 * @generated from /build/pastix/src/pastix-6.4.0/kernels/core_zxx2fr.c, normal z -> s, Sun Feb 15 18:01:31 2026
 *
 **/
#include "common.h"
#include <stdio.h>
#include <stdlib.h>
#include <cblas.h>
#include "pastix_slrcores.h"
#include "kernels_trace.h"
#ifndef DOXYGEN_SHOULD_SKIP_THIS
/*
 * Unit and zero scalars handed as alpha/beta to the intermediate GEMM calls
 * that fill the temporary workspace (work = 1 * X * Y + 0 * work).
 */
static float sone  =  1.0;
static float szero =  0.0;
#endif /* DOXYGEN_SHOULD_SKIP_THIS */

/**
 *******************************************************************************
 *
 * @brief Apply the update C = alpha * op(A) * op(B) + beta * C when A, B and C
 * are all stored in full-rank form.
 *
 * The product is computed by a single GEMM that writes directly into the
 * sub-block of C starting at row offx and column offy, with Cm as leading
 * dimension. The GEMM is issued while the lock is held.
 *
 *******************************************************************************
 *
 * @param[inout] params
 *          The LRMM structure that stores all the parameters used in the LRMM
 *          functions family.
 *          On exit, the targeted sub-block of C has been updated with the
 *          product of op(A) and op(B).
 *          @sa core_slrmm_t
 *
 *******************************************************************************
 *
 * @return The number of flops of the M-by-N-by-K GEMM that was performed.
 *
 *******************************************************************************/
pastix_fixdbl_t
core_sfrfr2fr( core_slrmm_t *params )
{
    PASTE_CORE_SLRMM_PARAMS( params );
    /* Leading dimensions follow the storage of A and B before op() is applied */
    const pastix_int_t lda  = (transA == PastixNoTrans) ? M : K;
    const pastix_int_t ldb  = (transB == PastixNoTrans) ? K : N;
    const pastix_int_t ldc  = Cm;
    float             *Cblk = C->u;
    pastix_fixdbl_t    nflops;

    /* Move to the first entry of the sub-block of C that receives the update */
    Cblk += ldc * offy + offx;

    pastix_atomic_lock( lock );

    /* C must still be full-rank: another thread must not have changed it */
    assert( C->rk == -1 );

    /* All operands are dense: one GEMM does the whole job */
    cblas_sgemm( CblasColMajor,
                 (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
                 M, N, K,
                 alpha, A->u, lda,
                        B->u, ldb,
                 beta,  Cblk, ldc );
    nflops = FLOPS_SGEMM( M, N, K );

    pastix_atomic_unlock( lock );

    PASTE_CORE_SLRMM_VOID;
    return nflops;
}

/**
 *******************************************************************************
 *
 * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with A and C
 * full-rank and B low-rank.
 *
 * B is given as the product Bu * Bv. Two evaluation orders are possible, and
 * the one with the smallest GEMM flop count is applied:
 *   - (op(A) * op(Bv)) * op(Bu), which needs a workspace of M * rb entries;
 *   - op(A) * op(Bu * Bv), which needs a workspace of K * N entries to store
 *     the expanded B.
 *
 * The workspace is the buffer returned by core_slrmm_getws() when it is not
 * NULL, and a temporary heap buffer released before returning otherwise.
 * Only the final GEMM that writes into C is issued while the lock is held.
 *
 * @warning This function has no error channel (it returns a flop count), so
 * the process is aborted with a message on stderr if the temporary workspace
 * cannot be allocated.
 *
 *******************************************************************************
 *
 * @param[inout] params
 *          The LRMM structure that stores all the parameters used in the LRMM
 *          functions family.
 *          On exit, the C matrix contains the product AB aligned with its own
 *          dimensions.
 *          @sa core_slrmm_t
 *
 *******************************************************************************
 *
 * @return The number of flops of the evaluation order that was applied.
 *
 *******************************************************************************/
pastix_fixdbl_t
core_sfrlr2fr( core_slrmm_t *params )
{
    PASTE_CORE_SLRMM_PARAMS( params );
    float *Cptr;
    pastix_int_t ldau, ldbu, ldbv, ldcu;
    pastix_fixdbl_t flops1 = FLOPS_SGEMM( M, B->rk, K ) + FLOPS_SGEMM( M, N, B->rk );
    pastix_fixdbl_t flops2 = FLOPS_SGEMM( K, N, B->rk ) + FLOPS_SGEMM( M, N, K     );
    pastix_fixdbl_t flops;
    size_t wsbytes;
    int allocated = 0;
    PASTE_CORE_SLRMM_VOID;

    ldau = (transA == PastixNoTrans) ? M : K;
    ldbu = (transB == PastixNoTrans) ? K : N;
    ldbv = ( B->rk == -1 ) ? -1 : B->rkmax;

    /* Move to the first entry of the sub-block of C that receives the update */
    ldcu = Cm;
    Cptr = C->u;
    Cptr += ldcu * offy + offx;

    /*
     *  A(M-by-K) * B( N-by-rb x rb-by-K )^t
     */
    if ( flops1 <= flops2 ) {
        if ( (work = core_slrmm_getws( params, M * B->rk )) == NULL ) {
            /* Widen before multiplying so the byte count cannot wrap in pastix_int_t */
            wsbytes = (size_t)M * (size_t)B->rk * sizeof(float);
            work    = malloc( wsbytes );
            /* malloc(0) may legitimately return NULL: only a non-empty request can fail */
            if ( (work == NULL) && (wsbytes > 0) ) {
                fprintf( stderr, "core_sfrlr2fr: failed to allocate %zu bytes of workspace\n",
                         wsbytes );
                abort();
            }
            allocated = 1;
        }

        /*
         *  work = op(A) * op(Bv), then C = alpha * work * op(Bu) + beta * C
         */
        cblas_sgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
                     M, B->rk, K,
                     (sone),  A->u, ldau,
                                         B->v, ldbv,
                     (szero), work, M );

        pastix_atomic_lock( lock );
        assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
        cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
                     M, N, B->rk,
                     (alpha), work, M,
                                         B->u, ldbu,
                     (beta),  Cptr, ldcu );
        flops = flops1;
        pastix_atomic_unlock( lock );
    }
    else {
        /*
         * Shape of B = Bu * Bv as it is stored, i.e. before op() is applied:
         * op(B) is K-by-N, so B is K-by-N without transposition and N-by-K
         * otherwise. Its number of rows is exactly ldbu.
         */
        pastix_int_t rowsB = ldbu;
        pastix_int_t colsB = (transB == PastixNoTrans) ? N : K;

        if ( (work = core_slrmm_getws( params, K * N )) == NULL ) {
            /* Widen before multiplying so the byte count cannot wrap in pastix_int_t */
            wsbytes = (size_t)K * (size_t)N * sizeof(float);
            work    = malloc( wsbytes );
            /* malloc(0) may legitimately return NULL: only a non-empty request can fail */
            if ( (work == NULL) && (wsbytes > 0) ) {
                fprintf( stderr, "core_sfrlr2fr: failed to allocate %zu bytes of workspace\n",
                         wsbytes );
                abort();
            }
            allocated = 1;
        }

        /*
         *  work = Bu * Bv (B expanded with its stored shape),
         *  then C = alpha * op(A) * op(work) + beta * C
         *
         * The expansion must use the stored shape of B and work must use the
         * matching leading dimension, because the final GEMM applies transB
         * to work. Expanding it as K-by-N regardless of transB is only valid
         * when B is not transposed or when N == K.
         */
        cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                     rowsB, colsB, B->rk,
                     (sone),  B->u, ldbu,
                                         B->v, ldbv,
                     (szero), work, rowsB );

        pastix_atomic_lock( lock );
        assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
        cblas_sgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
                     M, N, K,
                     (alpha), A->u, ldau,
                                         work, rowsB,
                     (beta),  Cptr, ldcu );

        flops = flops2;
        pastix_atomic_unlock( lock );
    }

    /* Only release the workspace when it came from the heap fallback above */
    if ( allocated ) {
        free( work );
    }
    return flops;
}

/**
 *******************************************************************************
 *
 * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with B and C
 * full-rank and A low-rank.
 *
 * A is given as the product Au * Av. Two evaluation orders are possible, and
 * the one with the smallest GEMM flop count is applied:
 *   - Au * (Av * op(B)), which needs a workspace of ra * N entries;
 *   - (Au * Av) * op(B), which needs a workspace of M * K entries to store
 *     the expanded A.
 *
 * The workspace is the buffer returned by core_slrmm_getws() when it is not
 * NULL, and a temporary heap buffer released before returning otherwise.
 * Only the final GEMM that writes into C is issued while the lock is held.
 *
 * @warning This function has no error channel (it returns a flop count), so
 * the process is aborted with a message on stderr if the temporary workspace
 * cannot be allocated.
 *
 *******************************************************************************
 *
 * @param[inout] params
 *          The LRMM structure that stores all the parameters used in the LRMM
 *          functions family.
 *          On exit, the C matrix contains the product AB aligned with its own
 *          dimensions.
 *          @sa core_slrmm_t
 *
 *******************************************************************************
 *
 * @return The number of flops of the evaluation order that was applied.
 *
 *******************************************************************************/
pastix_fixdbl_t
core_slrfr2fr( core_slrmm_t *params )
{
    PASTE_CORE_SLRMM_PARAMS( params );
    float *Cptr;
    pastix_int_t ldau, ldav, ldbu, ldcu;
    pastix_fixdbl_t flops1 = FLOPS_SGEMM( A->rk, N, K ) + FLOPS_SGEMM( M, N, A->rk );
    pastix_fixdbl_t flops2 = FLOPS_SGEMM( M, K, A->rk ) + FLOPS_SGEMM( M, N, K     );
    pastix_fixdbl_t flops;
    size_t wsbytes;
    int allocated = 0;
    PASTE_CORE_SLRMM_VOID;

    ldau = (transA == PastixNoTrans) ? M : K;
    ldav = ( A->rk == -1 ) ? -1 : A->rkmax;
    ldbu = (transB == PastixNoTrans) ? K : N;

    /* Move to the first entry of the sub-block of C that receives the update */
    ldcu = Cm;
    Cptr = C->u;
    Cptr += ldcu * offy + offx;

    /*
     *  A( M-by-ra x ra-by-K ) * B(N-by-K)^t
     */
    if ( flops1 <= flops2 ) {
        if ( (work = core_slrmm_getws( params, A->rk * N )) == NULL ) {
            /* Widen before multiplying so the byte count cannot wrap in pastix_int_t */
            wsbytes = (size_t)A->rk * (size_t)N * sizeof(float);
            work    = malloc( wsbytes );
            /* malloc(0) may legitimately return NULL: only a non-empty request can fail */
            if ( (work == NULL) && (wsbytes > 0) ) {
                fprintf( stderr, "core_slrfr2fr: failed to allocate %zu bytes of workspace\n",
                         wsbytes );
                abort();
            }
            allocated = 1;
        }

        /*
         *  work = Av * op(B), then C = alpha * Au * work + beta * C
         */
        cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)transB,
                     A->rk, N, K,
                     (sone),  A->v, ldav,
                                         B->u, ldbu,
                     (szero), work, A->rk );

        pastix_atomic_lock( lock );
        assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
        cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                     M, N, A->rk,
                     (alpha), A->u, ldau,
                                         work, A->rk,
                     (beta),  Cptr, ldcu );

        flops = flops1;
        pastix_atomic_unlock( lock );
    }
    else {
        if ( (work = core_slrmm_getws( params, M * K )) == NULL ) {
            /* Widen before multiplying so the byte count cannot wrap in pastix_int_t */
            wsbytes = (size_t)M * (size_t)K * sizeof(float);
            work    = malloc( wsbytes );
            /* malloc(0) may legitimately return NULL: only a non-empty request can fail */
            if ( (work == NULL) && (wsbytes > 0) ) {
                fprintf( stderr, "core_slrfr2fr: failed to allocate %zu bytes of workspace\n",
                         wsbytes );
                abort();
            }
            allocated = 1;
        }

        /*
         *  work = Au * Av (A expanded as an M-by-K matrix),
         *  then C = alpha * op(work) * op(B) + beta * C
         */
        cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
                     M, K, A->rk,
                     (sone),  A->u, ldau,
                                         A->v, ldav,
                     (szero), work, M );

        pastix_atomic_lock( lock );
        assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */
        cblas_sgemm( CblasColMajor, (CBLAS_TRANSPOSE)transA, (CBLAS_TRANSPOSE)transB,
                     M, N, K,
                     (alpha), work, M,
                                         B->u, ldbu,
                     (beta),  Cptr, ldcu );

        flops = flops2;
        pastix_atomic_unlock( lock );
    }

    /* Only release the workspace when it came from the heap fallback above */
    if ( allocated ) {
        free( work );
    }
    return flops;
}

/**
 *******************************************************************************
 *
 * @brief Perform the operation C = alpha * op(A) * op(B) + beta C, with A and B
 * low-rank and C full-rank.
 *
 * The product of A and B is first obtained in low-rank form (AB.u, AB.v) from
 * core_slrlr2lr(). When its rank is strictly positive, it is then applied to
 * the targeted sub-block of C with a single GEMM issued while the lock is held.
 *
 *******************************************************************************
 *
 * @param[inout] params
 *          The LRMM structure that stores all the parameters used in the LRMM
 *          functions family.
 *          On exit, the C matrix contains the product AB aligned with its own
 *          dimensions.
 *          @sa core_slrmm_t
 *
 *******************************************************************************
 *
 * @return The number of flops required to perform the operation: the flops
 *         reported by core_slrlr2lr(), plus those of the GEMM applied to C
 *         when the product has a non-zero rank.
 *
 *******************************************************************************/
pastix_fixdbl_t
core_slrlr2fr( core_slrmm_t *params )
{
    PASTE_CORE_SLRMM_PARAMS( params );
    float *Cptr;
    pastix_int_t        ldcu;
    pastix_lrblock_t    AB;
    pastix_trans_t      trans = PastixNoTrans;
    int                 infomask = 0;
    pastix_fixdbl_t     flops;

    /* Move to the first entry of the sub-block of C that receives the update */
    ldcu = Cm;
    Cptr = C->u;
    Cptr += ldcu * offy + offx;

    /* Build the low-rank product; infomask reports how AB.u / AB.v were obtained */
    flops = core_slrlr2lr( params, &AB, &infomask );
    assert( AB.rk != -1 );
    assert( AB.rkmax != -1 );

    /* AB.v must be used with transB when the product flagged it */
    if ( infomask & PASTIX_LRM3_TRANSB ) {
        trans = transB;
    }

    /*
     * NOTE(review): when AB.rk == 0 the GEMM is skipped, so beta is never
     * applied to C. This is only equivalent to the documented operation if
     * beta == 1 - confirm against the callers.
     */
    if ( AB.rk > 0 ) {
        pastix_int_t ldabv = (trans == PastixNoTrans) ? AB.rkmax : N;

        pastix_atomic_lock( lock );
        assert( C->rk == -1 ); /* Check that C has not changed due to parallelism */

        cblas_sgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                     M, N, AB.rk,
                     (alpha), AB.u, M,
                                         AB.v, ldabv,
                     (beta),  Cptr, ldcu );
        /* Accumulate: the cost of forming AB must not be discarded */
        flops += FLOPS_SGEMM( M, N, AB.rk );
        pastix_atomic_unlock( lock );
    }

    /* Free the buffers that core_slrlr2lr() flagged as allocated for AB */
    if ( infomask & PASTIX_LRM3_ALLOCU ) {
        free(AB.u);
    }
    if ( infomask & PASTIX_LRM3_ALLOCV ) {
        free(AB.v);
    }

    PASTE_CORE_SLRMM_VOID;
    return flops;
}
