Developer Guide

Developer Guide for Intel® oneAPI Math Kernel Library Windows*

ID 766692
Date 3/22/2024
Public

A newer version of this document is available. Customers should click here to go to the newest version.

Document Table of Contents

CMake Config for oneMKL

If you want to integrate oneMKL into your CMake projects, starting with the Intel® oneAPI Math Kernel Library (oneMKL) 2021.3 release, MKLConfig.cmake is provided as part of the package and installation. MKLConfig.cmake supports all oneMKL configurations, compilers, and runtimes, as the oneMKL product itself. Help/usage is provided in the top section of MKLConfig.cmake.

Example

NOTE:
The following example demonstrates how to link with oneMKL using the C/Fortran API. For the oneMKL SYCL functionality, use the MKL::MKL_DPCPP target instead of MKL::MKL. For more information about how to link cluster functionality, enable the OpenMP Offload feature, specify the link type and threading type, and so forth, refer to the input and output parameters listed in the comments at the top of the MKLConfig.cmake file.
my_test/
  |_ build/                <-- Out-of-source build directory
  |_ CMakeLists.txt        <-- User side project's CMakeLists.txt
  |_ app.c                 <-- Source file that uses oneMKL API

app.c

#include <stdio.h>
#include "mkl.h"

int main()
{
    MKLVersion mkl_version;
    mkl_get_version(&mkl_version);

    printf("You are using oneMKL %d.%d\n", mkl_version.MajorVersion, mkl_version.UpdateVersion);

    return 0;
}

CMakeLists.txt

cmake_minimum_required(VERSION 3.13)

project(oneMKL_Example LANGUAGES C)

find_package(MKL CONFIG REQUIRED PATHS $ENV{MKLROOT})

add_executable(myapp app.c)
target_link_libraries(myapp PUBLIC MKL::MKL)

Command line

# Call the oneAPI setvars.bat/oneapi-vars.bat script beforehand
build>cmake .. -G Ninja -DCMAKE_C_COMPILER=icx
build>cmake --build .

If your application requires a customized setting, the following CMakeLists.txt demonstrates how to achieve a more fine-grained control.

CMakeLists.txt

cmake_minimum_required(VERSION 3.13)

project(oneMKL_Example LANGUAGES C)

find_package(MKL CONFIG REQUIRED PATHS $ENV{MKLROOT})
message(STATUS "Imported oneMKL targets: ${MKL_IMPORTED_TARGETS}")

add_executable(myapp app.c)
target_compile_options(myapp PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
target_include_directories(myapp PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
target_link_libraries(myapp PUBLIC $<LINK_ONLY:MKL::MKL>)
NOTE:

Ninja and NMake Makefiles are supported on Windows. Other CMake Generators may work but are not guaranteed.

When the Ninja build system is in use, Ninja 1.10.2 or later is required for Fortran support.

CBLAS Example

mkl_example.h

/*******************************************************************************
* Copyright 1999-2022 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
!  Content:
!      Intel(R) oneAPI Math Kernel Library (oneMKL) examples interface
!******************************************************************************/

#ifndef _MKL_EXAMPLE_H_
#define _MKL_EXAMPLE_H_

#include "mkl_types.h"
#include "mkl_cblas.h"
#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */

#define COMMENTS ':'
#define MAX_STRING_LEN  512

#define MIN(a,b)  (((a) > (b)) ? (b) : (a))
#define MAX(a,b)  (((a) > (b)) ? (a) : (b))
#define ABS(a)    (((a) < 0) ? -(a) : (a))

#define GENERAL_MATRIX  0
#define UPPER_MATRIX    1
#define LOWER_MATRIX   -1

#define FULLPRINT  0
#define SHORTPRINT 1

#ifdef MKL_ILP64
  #define INT_FORMAT "%lld"
#else
  #define INT_FORMAT "%d"
#endif

/*
 * ===========================================
 * Prototypes for example program functions
 * ===========================================
 */

int GetIntegerParameters(FILE*, ...);
int GetIntegerParameters8(FILE*, ...);
int GetIntegerParameters16(FILE*, ...);
int GetIntegerParameters32(FILE*, ...);
int GetScalarsH(FILE*, ...);
int GetScalarsD(FILE*, ...);
int GetScalarsS(FILE*, ...);
int GetScalarsC(FILE*, ...);
int GetScalarsZ(FILE*, ...);
int GetCblasCharParameters(FILE *in_file, ...);
MKL_INT MaxValue(MKL_INT, MKL_INT*);
MKL_INT GetVectorI(FILE*, MKL_INT, MKL_INT*);
MKL_INT GetVectorI32(FILE*, MKL_INT, MKL_INT32*);
MKL_INT GetVectorS(FILE*, MKL_INT, float*, MKL_INT);
MKL_INT GetVectorC(FILE*, MKL_INT, MKL_Complex8*, MKL_INT);
MKL_INT GetVectorD(FILE*, MKL_INT, double*, MKL_INT);
MKL_INT GetVectorZ(FILE*, MKL_INT, MKL_Complex16*, MKL_INT);
MKL_INT GetArrayI8(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, MKL_INT8*, MKL_INT*);
MKL_INT GetArrayI16(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, MKL_INT16*, MKL_INT*);
MKL_INT GetArrayI32(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, MKL_INT32*, MKL_INT*);
MKL_INT GetArrayBF16(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, MKL_BF16*, MKL_INT*);
MKL_INT GetArrayH(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, MKL_F16*, MKL_INT*);
MKL_INT GetArrayS(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, float*, MKL_INT*);
MKL_INT GetArrayD(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, double*, MKL_INT*);
MKL_INT GetArrayC(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, MKL_Complex8*, MKL_INT*);
MKL_INT GetArrayZ(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT*, MKL_INT*, MKL_Complex16*, MKL_INT*);
MKL_INT GetBandArrayS(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT, MKL_INT, MKL_INT, float*, MKL_INT);
MKL_INT GetBandArrayD(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT, MKL_INT, MKL_INT, double*, MKL_INT);
MKL_INT GetBandArrayC(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT, MKL_INT, MKL_INT, MKL_Complex8*, MKL_INT);
MKL_INT GetBandArrayZ(FILE*, CBLAS_LAYOUT*, MKL_INT, MKL_INT, MKL_INT, MKL_INT, MKL_Complex16*, MKL_INT);
MKL_INT GetValuesI8(FILE*, MKL_INT8*, MKL_INT, MKL_INT, MKL_INT);
MKL_INT GetValuesI16(FILE*, MKL_INT16*, MKL_INT, MKL_INT, MKL_INT);
MKL_INT GetValuesI32(FILE*, MKL_INT32*, MKL_INT, MKL_INT, MKL_INT);
MKL_INT GetValuesI(FILE *, MKL_INT*, MKL_INT, MKL_INT);
MKL_INT GetValuesBF16(FILE *, MKL_BF16*, MKL_INT, MKL_INT, MKL_INT);
MKL_INT GetValuesH(FILE *, MKL_F16*, MKL_INT, MKL_INT, MKL_INT);
MKL_INT GetValuesC(FILE *, MKL_Complex8*, MKL_INT, MKL_INT, MKL_INT);
MKL_INT GetValuesZ(FILE *, MKL_Complex16*, MKL_INT, MKL_INT, MKL_INT);
MKL_INT GetValuesD(FILE *, double*, MKL_INT, MKL_INT, MKL_INT);
MKL_INT GetValuesS(FILE *, float*, MKL_INT, MKL_INT, MKL_INT);
float   GetValueH(MKL_F16);
void PrintParameters( char*, ... );
void PrintVectorI(MKL_INT, MKL_INT*, char*);
void PrintVectorI32(MKL_INT, MKL_INT32*, char*);
void PrintVectorS(int, MKL_INT, float*, MKL_INT, char*);
void PrintVectorC(int, MKL_INT, MKL_Complex8*, MKL_INT, char*);
void PrintVectorD(int, MKL_INT, double*, MKL_INT, char*);
void PrintVectorZ(int, MKL_INT, MKL_Complex16*,  MKL_INT, char*);
void PrintArrayI8(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, MKL_INT8*, MKL_INT*, char*);
void PrintArrayI16(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, MKL_INT16*, MKL_INT*, char*);
void PrintArrayI32(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, MKL_INT32*, MKL_INT*, char*);
void PrintArrayBF16(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, MKL_BF16*, MKL_INT*, char*);
void PrintArrayH(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, MKL_F16*, MKL_INT*, char*);
void PrintArrayS(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, float*, MKL_INT*, char*);
void PrintArrayD(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, double*, MKL_INT*, char*);
void PrintArrayC(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, MKL_Complex8*, MKL_INT*, char*);
void PrintArrayZ(CBLAS_LAYOUT*, int, int, MKL_INT*, MKL_INT*, MKL_Complex16*, MKL_INT*, char*);
void PrintBandArrayS(CBLAS_LAYOUT*, int, MKL_INT, MKL_INT, MKL_INT, MKL_INT, float*, MKL_INT, char*);
void PrintBandArrayD(CBLAS_LAYOUT*, int, MKL_INT, MKL_INT, MKL_INT, MKL_INT, double*, MKL_INT, char*);
void PrintBandArrayC(CBLAS_LAYOUT*, int, MKL_INT, MKL_INT, MKL_INT, MKL_INT, MKL_Complex8*, MKL_INT, char*);
void PrintBandArrayZ(CBLAS_LAYOUT*, int, MKL_INT, MKL_INT, MKL_INT, MKL_INT, MKL_Complex16*, MKL_INT, char*);

#ifdef __cplusplus
}
#endif /* __cplusplus */

#endif /* _MKL_EXAMPLE_H_ */

common_func.c

/*******************************************************************************
* Copyright 1999-2022 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
!  Content:
!
!******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#include <math.h>

#include "mkl_example.h"
#include "mkl_cblas.h"

MKL_INT MaxValue(MKL_INT n, MKL_INT *x)
{
      MKL_INT   i, indmax;

      indmax = x[0];
      for (i = 1; i < n; i++)
          if (indmax < x[i]) indmax = x[i];
      return indmax;
} /* MaxValue */

typedef union {
  float float_part;
  MKL_BF16 int_part[2];
} conv_union_bf16;

typedef union {
  MKL_F16  raw;
  struct {
    unsigned int frac : 10;
    unsigned int exp  :  5;
    unsigned int sign :  1;
  } bits;
} conv_union_f16;

typedef union {
  float raw;
  struct {
    unsigned int frac : 23;
    unsigned int exp  :  8;
    unsigned int sign :  1;
  } bits;
} conv_union_f32;

static float b2f(MKL_BF16 src) {
  conv_union_bf16 conv;
  conv.int_part[0] = 0;
  conv.int_part[1] = src;
  return conv.float_part;
}

static MKL_BF16 f2b(float src) {
  conv_union_bf16 conv;
  conv.float_part = src;
  return conv.int_part[1];
}

static float h2f(MKL_F16 x) { 
  conv_union_f16 src;
  conv_union_f32 dst;

  src.raw  = x;
  dst.raw  = 0;
  dst.bits.sign = src.bits.sign;

  if (src.bits.exp == 0x01fU) {
    dst.bits.exp  = 0xffU;
    if (src.bits.frac > 0) {
      dst.bits.frac = ((src.bits.frac | 0x200U) << 13);
    }
  } else if (src.bits.exp > 0x00U) {
    dst.bits.exp  = src.bits.exp + ((1 << 7) - (1 << 4));
    dst.bits.frac = (src.bits.frac << 13);
  } else {
    unsigned int v = (src.bits.frac << 13);

    if (v > 0) {
      dst.bits.exp = 0x71;
      while ((v & 0x800000UL) == 0) {
	dst.bits.exp --;
	v <<= 1;
      }
      dst.bits.frac = v;
    }
  }

  return dst.raw;
}

static MKL_F16 f2h(float x) { 
  conv_union_f32 src;
  conv_union_f16 dst;

  src.raw  = x;
  dst.raw  = 0;
  dst.bits.sign = src.bits.sign;

  if (src.bits.exp == 0x0ffU) {
    dst.bits.exp  = 0x01fU;
    dst.bits.frac = (src.bits.frac >> 13);
    if (src.bits.frac > 0) dst.bits.frac |= 0x200U;
  } else if (src.bits.exp >= 0x08fU) {
    dst.bits.exp  = 0x01fU;
    dst.bits.frac = 0x000U;
  } else if (src.bits.exp >= 0x071U){
    dst.bits.exp  = src.bits.exp + ((1 << 4) - (1 << 7));
    dst.bits.frac = (src.bits.frac >> 13);
  } else if (src.bits.exp >= 0x067U){
    dst.bits.exp  = 0x000;
    if (src.bits.frac > 0) {
      dst.bits.frac = (((1U << 23) | src.bits.frac) >> 14);
    } else {
      dst.bits.frac = 1;
    }
  }

  return dst.raw;
}

float   GetValueH(MKL_F16 x)
{
      return h2f( x );
} /* GetValueH */

MKL_INT GetVectorI(FILE *in_file, MKL_INT n, MKL_INT *x)
{
      return GetValuesI( in_file, x, 0, n );
} /* GetVectorI */

MKL_INT GetVectorI16(FILE *in_file, MKL_INT n, MKL_INT16 *x)
{
      return GetValuesI16( in_file, x, 1, 0, n );
} /* GetVectorI16 */

MKL_INT GetVectorI32(FILE *in_file, MKL_INT n, MKL_INT32 *x)
{
      return GetValuesI32( in_file, x, 1, 0, n );
} /* GetVectorI32 */

MKL_INT GetVectorS(FILE *in_file, MKL_INT n, float *x,  MKL_INT incx)
{
      return GetValuesS( in_file, x, 1, 0, (1+(n-1)*ABS(incx)) );
} /* GetVectorS */

MKL_INT GetVectorD(FILE *in_file, MKL_INT n, double *x,  MKL_INT incx)
{
      return GetValuesD( in_file, x, 1, 0, (1+(n-1)*ABS(incx)) );
} /* GetVectorD */

MKL_INT GetVectorC(FILE *in_file, MKL_INT n, MKL_Complex8 *x,  MKL_INT incx)
{
      return GetValuesC( in_file, x, 1, 0, (1+(n-1)*ABS(incx)) );
} /* GetVectorC */

MKL_INT GetVectorZ(FILE *in_file, MKL_INT n, MKL_Complex16 *x,  MKL_INT incx)
{
      return GetValuesZ( in_file, x, 1, 0, (1+(n-1)*ABS(incx)) );
} /* GetVectorZ */


MKL_INT GetArrayI8(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag, MKL_INT *m, MKL_INT *n,
                    MKL_INT8 *a, MKL_INT *lda)
{
      MKL_INT i, j , number;
      MKL_INT8 *addr;

      if( *layout == CblasRowMajor ) {
        if( flag == GENERAL_MATRIX ) {
           for( i = 0; i < (*m); i++ ) {
              addr = a + i*(*lda);
              number = GetValuesI8( in_file, addr, 1, 0, *n );
              if( number != *n ) return 1;
           } /* for */
        } else if( flag == UPPER_MATRIX ) {
           for (i = 0; i < (*m); i++) {
              addr = a + i*(*lda);
              number = GetValuesI8( in_file, addr, 1, i, *n-i );
              if( number != *n-i ) return 1;
           } /* for */
        } else if( flag == LOWER_MATRIX ) {
           for (i = 0; i < (*m); i++) {
              addr = a + i*(*lda);
              number = GetValuesI8( in_file, addr, 1, 0, i+1 );
              if( number != i+1 ) return 1;
           } /* for */
        } /* if */
     } else if( *layout == CblasColMajor ) {
        if( flag == GENERAL_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI8( in_file, addr, 1, 0, *m );
              if( number != *m ) return 1;
           } /* for */
        } else if( flag == UPPER_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI8( in_file, addr, 1, 0, j+1 );
              if( number != j+1 ) return 1;
           } /* for */
        } else if( flag == LOWER_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI8( in_file, addr, 1, j, *m-j );
              if( number != *m-j ) return 1;
           } /* for */
        } /* if */
     } /* if */
     return 0;
} /* GetArrayI8 */

MKL_INT GetArrayI16(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag, MKL_INT *m, MKL_INT *n,
                    MKL_INT16 *a, MKL_INT *lda)
{
      MKL_INT i, j , number;
      MKL_INT16 *addr;

      if( *layout == CblasRowMajor ) {
        if( flag == GENERAL_MATRIX ) {
           for( i = 0; i < (*m); i++ ) {
              addr = a + i*(*lda);
              number = GetValuesI16( in_file, addr, 1, 0, *n );
              if( number != *n ) return 1;
           } /* for */
        } else if( flag == UPPER_MATRIX ) {
           for (i = 0; i < (*m); i++) {
              addr = a + i*(*lda);
              number = GetValuesI16( in_file, addr, 1, i, *n-i );
              if( number != *n-i ) return 1;
           } /* for */
        } else if( flag == LOWER_MATRIX ) {
           for (i = 0; i < (*m); i++) {
              addr = a + i*(*lda);
              number = GetValuesI16( in_file, addr, 1, 0, i+1 );
              if( number != i+1 ) return 1;
           } /* for */
        } /* if */
     } else if( *layout == CblasColMajor ) {
        if( flag == GENERAL_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI16( in_file, addr, 1, 0, *m );
              if( number != *m ) return 1;
           } /* for */
        } else if( flag == UPPER_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI16( in_file, addr, 1, 0, j+1 );
              if( number != j+1 ) return 1;
           } /* for */
        } else if( flag == LOWER_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI16( in_file, addr, 1, j, *m-j );
              if( number != *m-j ) return 1;
           } /* for */
        } /* if */
     } /* if */
     return 0;
} /* GetArrayI16 */

MKL_INT GetArrayI32(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag, MKL_INT *m, MKL_INT *n,
                    MKL_INT32 *a, MKL_INT *lda)
{
      MKL_INT i, j , number;
      MKL_INT32 *addr;

      if( *layout == CblasRowMajor ) {
        if( flag == GENERAL_MATRIX ) {
           for( i = 0; i < (*m); i++ ) {
              addr = a + i*(*lda);
              number = GetValuesI32( in_file, addr, 1, 0, *n );
              if( number != *n ) return 1;
           } /* for */
        } else if( flag == UPPER_MATRIX ) {
           for (i = 0; i < (*m); i++) {
              addr = a + i*(*lda);
              number = GetValuesI32( in_file, addr, 1, i, *n-i );
              if( number != *n-i ) return 1;
           } /* for */
        } else if( flag == LOWER_MATRIX ) {
           for (i = 0; i < (*m); i++) {
              addr = a + i*(*lda);
              number = GetValuesI32( in_file, addr, 1, 0, i+1 );
              if( number != i+1 ) return 1;
           } /* for */
        } /* if */
     } else if( *layout == CblasColMajor ) {
        if( flag == GENERAL_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI32( in_file, addr, 1, 0, *m );
              if( number != *m ) return 1;
           } /* for */
        } else if( flag == UPPER_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI32( in_file, addr, 1, 0, j+1 );
              if( number != j+1 ) return 1;
           } /* for */
        } else if( flag == LOWER_MATRIX ) {
           for( j = 0; j < (*n); j++ ) {
              addr = a + j*(*lda);
              number = GetValuesI32( in_file, addr, 1, j, *m-j );
              if( number != *m-j ) return 1;
           } /* for */
        } /* if */
     } /* if */
     return 0;
} /* GetArrayI32 */

MKL_INT GetArrayBF16(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag, MKL_INT *m, MKL_INT *n,
              MKL_BF16 *a,  MKL_INT *lda)
{
      MKL_INT   i, j, number;
      MKL_BF16 *addr;

      if( *layout == CblasRowMajor ) {
         if( flag == GENERAL_MATRIX ) {
            for( i = 0; i < (*m); i++ ) {
               addr = a + i*(*lda);
               number = GetValuesBF16( in_file, addr, 1, 0, *n );
               if( number != *n ) return 1;
            } /* for */
         } else if( flag == UPPER_MATRIX ) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesBF16( in_file, addr, 1, i, *n-i );
               if( number != *n-i ) return 1;
            } /* for */
         } else if( flag == LOWER_MATRIX ) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesBF16( in_file, addr, 1, 0, i+1 );
               if( number != i+1 ) return 1;
            } /* for */
         } /* if */
      } else if( *layout == CblasColMajor ) {
         if( flag == GENERAL_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesBF16( in_file, addr, 1, 0, *m );
               if( number != *m ) return 1;
            } /* for */
         } else if( flag == UPPER_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesBF16( in_file, addr, 1, 0, j+1 );
               if( number != j+1 ) return 1;
            } /* for */
         } else if( flag == LOWER_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesBF16( in_file, addr, 1, j, *m-j );
               if( number != *m-j ) return 1;
            } /* for */
         } /* if */
      } /* if */
      return 0;
} /* GetArrayBF16 */

MKL_INT GetArrayH(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag, MKL_INT *m, MKL_INT *n,
              MKL_F16 *a,  MKL_INT *lda)
{
      MKL_INT   i, j, number;
      MKL_F16    *addr;

      if( *layout == CblasRowMajor ) {
         if( flag == GENERAL_MATRIX ) {
            for( i = 0; i < (*m); i++ ) {
               addr = a + i*(*lda);
               number = GetValuesH( in_file, addr, 1, 0, *n );
               if( number != *n ) return 1;
            } /* for */
         } else if( flag == UPPER_MATRIX ) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesH( in_file, addr, 1, i, *n-i );
               if( number != *n-i ) return 1;
            } /* for */
         } else if( flag == LOWER_MATRIX ) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesH( in_file, addr, 1, 0, i+1 );
               if( number != i+1 ) return 1;
            } /* for */
         } /* if */
      } else if( *layout == CblasColMajor ) {
         if( flag == GENERAL_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesH( in_file, addr, 1, 0, *m );
               if( number != *m ) return 1;
            } /* for */
         } else if( flag == UPPER_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesH( in_file, addr, 1, 0, j+1 );
               if( number != j+1 ) return 1;
            } /* for */
         } else if( flag == LOWER_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesH( in_file, addr, 1, j, *m-j );
               if( number != *m-j ) return 1;
            } /* for */
         } /* if */
      } /* if */
      return 0;
} /* GetArrayH */

MKL_INT GetArrayS(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag, MKL_INT *m, MKL_INT *n,
              float *a,  MKL_INT *lda)
{
      MKL_INT   i, j, number;
      float    *addr;

      if( *layout == CblasRowMajor ) {
         if( flag == GENERAL_MATRIX ) {
            for( i = 0; i < (*m); i++ ) {
               addr = a + i*(*lda);
               number = GetValuesS( in_file, addr, 1, 0, *n );
               if( number != *n ) return 1;
            } /* for */
         } else if( flag == UPPER_MATRIX ) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesS( in_file, addr, 1, i, *n-i );
               if( number != *n-i ) return 1;
            } /* for */
         } else if( flag == LOWER_MATRIX ) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesS( in_file, addr, 1, 0, i+1 );
               if( number != i+1 ) return 1;
            } /* for */
         } /* if */
      } else if( *layout == CblasColMajor ) {
         if( flag == GENERAL_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesS( in_file, addr, 1, 0, *m );
               if( number != *m ) return 1;
            } /* for */
         } else if( flag == UPPER_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesS( in_file, addr, 1, 0, j+1 );
               if( number != j+1 ) return 1;
            } /* for */
         } else if( flag == LOWER_MATRIX ) {
            for( j = 0; j < (*n); j++ ) {
               addr = a + j*(*lda);
               number = GetValuesS( in_file, addr, 1, j, *m-j );
               if( number != *m-j ) return 1;
            } /* for */
         } /* if */
      } /* if */
      return 0;
} /* GetArrayS */

MKL_INT GetArrayD(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag, MKL_INT *m, MKL_INT *n,
              double *a,  MKL_INT *lda)
{
      MKL_INT   i, j, number;
      double   *addr;

      if (*layout == CblasRowMajor) {
         if (flag == GENERAL_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesD( in_file, addr, 1, 0, *n );
               if( number != *n ) return 1;
            } /* for */
         } else if (flag == UPPER_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesD( in_file, addr, 1, i, *n-i );
               if( number != *n-i ) return 1;
            } /* for */
         } else if (flag == LOWER_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesD( in_file, addr, 1, 0, i+1 );
               if( number != i+1 ) return 1;
            } /* for */
         } /* if */
      } else if (*layout == CblasColMajor) {
         if (flag == GENERAL_MATRIX) {
             for (j = 0; j < (*n); j++) {
                addr = a + j*(*lda);
                number = GetValuesD( in_file, addr, 1, 0, *m );
                if( number != *m ) return 1;
             } /* for */
         } else if (flag == UPPER_MATRIX) {
             for (j = 0; j < (*n); j++) {
                 addr = a + j*(*lda);
                 number = GetValuesD( in_file, addr, 1, 0, j+1 );
                 if( number != j+1 ) return 1;
             } /* for */
         } else if (flag == LOWER_MATRIX) {
             for (j = 0; j < (*n); j++) {
                 addr = a + j*(*lda);
                 number = GetValuesD( in_file, addr, 1, j, *m-j );
                 if( number != *m-j ) return 1;
             } /* for */
         } /* if */
      } /* if */
      return 0;
} /* GetArrayD */

MKL_INT GetArrayC(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag, MKL_INT *m, MKL_INT *n,
              MKL_Complex8 *a,  MKL_INT *lda)
{
      MKL_INT        i, j, number;
      MKL_Complex8  *addr;

      if (*layout == CblasRowMajor) {
         if (flag == GENERAL_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesC( in_file, addr, 1, 0, *n );
               if( number != *n ) return 1;
            } /* for */
         } else if (flag == UPPER_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesC( in_file, addr, 1, i, *n-i );
               if( number != *n-i ) return 1;
            } /* for */
         } else if (flag == LOWER_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesC( in_file, addr, 1, 0, i+1 );
               if( number != i+1 ) return 1;
            } /* for */
         } /* if */
      } else if (*layout == CblasColMajor) {
         if (flag == GENERAL_MATRIX) {
            for (j = 0; j < (*n); j++) {
               addr = a + j*(*lda);
               number = GetValuesC( in_file, addr, 1, 0, *m );
               if( number != *m ) return 1;
            } /* for */
         } else if (flag == UPPER_MATRIX) {
            for (j = 0; j < (*n); j++) {
               addr = a + j*(*lda);
               number = GetValuesC( in_file, addr, 1, 0, j+1 );
               if( number != j+1 ) return 1;
            } /* for */
         } else if (flag == LOWER_MATRIX) {
            for (j = 0; j < (*n); j++) {
               addr = a + j*(*lda);
               number = GetValuesC( in_file, addr, 1, j, *m-j );
               if( number != *m-j ) return 1;
            } /* for */
         } /* if */
      } /* if */
      return 0;
} /* GetArrayC */

MKL_INT GetArrayZ(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT flag,
              MKL_INT *m, MKL_INT *n, MKL_Complex16 *a,  MKL_INT *lda)
{
      MKL_INT        i, j, number;
      MKL_Complex16 *addr;

      if (*layout == CblasRowMajor) {
         if (flag == GENERAL_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesZ( in_file, addr, 1, 0, *n );
               if( number != *n ) return 1;
            } /* for */
         } else if (flag == UPPER_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesZ( in_file, addr, 1, i, *n-i );
               if( number != *n-i ) return 1;
            } /* for */
         } else if (flag == LOWER_MATRIX) {
            for (i = 0; i < (*m); i++) {
               addr = a + i*(*lda);
               number = GetValuesZ( in_file, addr, 1, 0, i+1 );
               if( number != i+1 ) return 1;
            } /* for */
         } /* if */
      } else if(*layout == CblasColMajor) {
         if (flag == GENERAL_MATRIX) {
            for (j = 0; j < (*n); j++) {
               addr = a + j*(*lda);
               number = GetValuesZ( in_file, addr, 1, 0, *m );
               if( number != *m ) return 1;
            } /* for */
         } else if (flag == UPPER_MATRIX) {
            for (j = 0; j < (*n); j++) {
               addr = a + j*(*lda);
               number = GetValuesZ( in_file, addr, 1, 0, j+1 );
               if( number != j+1 ) return 1;
            } /* for */
         } else if (flag == LOWER_MATRIX) {
            for (j = 0; j < (*n); j++) {
               addr = a + j*(*lda);
               number = GetValuesZ( in_file, addr, 1, j, *m-j );
               if( number != *m-j ) return 1;
            } /* for */
         } /* if */
      } /* if */
      return 0;
} /* GetArrayZ */

MKL_INT GetBandArrayS(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT kl, MKL_INT ku,
                  MKL_INT m, MKL_INT n, float *a,  MKL_INT lda)
{
      MKL_INT  i, j;
      MKL_INT  kl1, ku1, i_start, j_start, j_end, ku_rows, kl_rows, number;
      float   *addr, *addr1;

      if (*layout == CblasRowMajor) {
         for( i = 0; i < MIN( m, n ); i++ ) {
            addr = a + i*lda;
            kl1  = ( i - kl <= 0 ) ? i : kl;
            ku1  = ( i + ku >= n ) ? MAX(0,n-i-1) : ku;
            j_start = kl - kl1;
            j_end = j_start + kl1 + ku1;
            addr1 = addr + j_start;
            number = GetValuesS( in_file, addr1, 1, 0, j_end-j_start+1 );
            if( number != j_end-j_start+1 ) return 1;
         }
         for( i = MIN( m, n ); i < MIN( m, MIN( m, n ) + kl); i++ ) {
            addr = a + i*lda;
            kl1  = n - i + kl;
            j_start = ( kl > n ) ? kl - n : n - kl;
            j_end = j_start + kl1 - 1;
            addr1 = addr + j_start;
            number = GetValuesS( in_file, addr1, 1, 0, j_end-j_start+1 );
            if( number != j_end-j_start+1 ) return 1;
         }
      } else if (*layout == CblasColMajor) {
         i_start = (ku > n ) ? ku - n + 1 : 0;
         ku_rows = (ku > n ) ? n - 1 : ku;
         j_start = ku_rows;
         for( i = 0; i < ku_rows; i++ ) {
              j = j_start*lda; addr1 = a+i+i_start;
              number = GetValuesS( in_file, addr1, lda, j, n-ku_rows );
              if( number != n-ku_rows ) return 1;
              j_start--;
         }

         j_end = MIN(m,n);
         addr1 = a + ku;
         number = GetValuesS( in_file, addr1, lda, 0, j_end );
         if( number != j_end ) return 1;

         kl_rows = ( kl <= m-1 ) ?  kl : m - 1;
         for ( i = 1; i < kl_rows+1; i++ ) {
              kl1 = ( i+j_end <= m ) ? j_end : m - i;
              addr1 = a + ku + i;
              number = GetValuesS( in_file, addr1, lda, 0, kl1 );
              if( number != kl1 ) return 1;
         }
      }
      return 0;
} /* GetBandArrayS */

MKL_INT GetBandArrayD(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT kl, MKL_INT ku,
                  MKL_INT m, MKL_INT n, double *a,  MKL_INT lda)
{
      MKL_INT     i, j;
      MKL_INT     kl1, ku1, i_start, j_start, j_end, kl_rows, ku_rows, number;
      double     *addr, *addr1;

      if (*layout == CblasRowMajor) {
         for( i = 0; i < MIN( m, n ); i++ ) {
            addr = a + i*lda;
            kl1  = ( i - kl <= 0 ) ? i : kl;
            ku1  = ( i + ku >= n ) ? MAX(0,n-i-1) : ku;
            j_start = kl - kl1;
            j_end = j_start + kl1 + ku1;
            addr1 = addr + j_start;
            number = GetValuesD( in_file, addr1, 1, 0, j_end-j_start+1 );
            if( number != j_end-j_start+1 ) return 1;
         }
         for( i = MIN( m, n ); i < MIN( m, MIN( m, n ) + kl); i++ ) {
            addr = a + i*lda;
            kl1  = n - i + kl;
            j_start = ( kl > n ) ? kl - n : n - kl;
            j_end = j_start + kl1 - 1;
            addr1 = addr + j_start;
            number = GetValuesD( in_file, addr1, 1, 0, j_end-j_start+1 );
            if( number != j_end-j_start+1 ) return 1;
         }
      } else if (*layout == CblasColMajor) {
         i_start = (ku > n ) ? ku - n + 1 : 0;
         ku_rows = (ku > n ) ? n - 1 : ku;
         j_start = ku_rows;
         for( i = 0; i < ku_rows; i++ ) {
              j = j_start*lda; addr1 = a+i+i_start;
              number = GetValuesD( in_file, addr1, lda, j, n-ku_rows );
              if( number != n-ku_rows ) return 1;
              j_start--;
         }

         j_end = MIN(m,n);
         addr1 = a+ku;
         number = GetValuesD( in_file, addr1, lda, 0, j_end );
         if( number != j_end ) return 1;

         kl_rows = ( kl <= m-1 ) ?  kl : m - 1;
         for ( i = 1; i < kl_rows+1; i++ ) {
              kl1 = ( i+j_end <= m ) ? j_end : m - i;
              addr1 = a+ku+i;
              number = GetValuesD( in_file, addr1, lda, 0, kl1 );
              if( number != kl1 ) return 1;
         }
      } /* if */
      return 0;
} /* GetBandArrayD */

MKL_INT GetBandArrayC(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT kl, MKL_INT ku,
                  MKL_INT m, MKL_INT n, MKL_Complex8 *a,  MKL_INT lda)
{
      MKL_INT        i, j;
      MKL_INT        kl1, ku1, i_start, j_start, j_end, ku_rows, kl_rows, number;
      MKL_Complex8  *addr, *addr1;

      if (*layout == CblasRowMajor) {
         for( i = 0; i < MIN( m, n ); i++ ) {
            addr = a + i*lda;
            kl1  = ( i - kl <= 0 ) ? i : kl;
            ku1  = ( i + ku >= n ) ? MAX(0,n-i-1) : ku;
            j_start = kl - kl1;
            j_end = j_start + kl1 + ku1;
            addr1 = addr + j_start;
            number = GetValuesC( in_file, addr1, 1, 0, j_end-j_start+1 );
            if( number != j_end-j_start+1 ) return 1;
         }
         for( i = MIN( m, n ); i < MIN( m, MIN( m, n ) + kl); i++ ) {
            addr = a + i*lda;
            kl1  = n - i + kl;
            j_start = ( kl > n ) ? kl - n : n - kl;
            j_end = j_start + kl1 - 1;
            addr1 = addr + j_start;
            number = GetValuesC( in_file, addr1, 1, 0, j_end-j_start+1 );
            if( number != j_end-j_start+1 ) return 1;
         }
      } else if (*layout == CblasColMajor) {
         i_start = (ku > n ) ? ku - n + 1 : 0;
         ku_rows = (ku > n ) ? n - 1 : ku;
         j_start = ku_rows;
         for( i = 0; i < ku_rows; i++ ) {
              j = j_start*lda; addr1 = a+i+i_start;
              number = GetValuesC( in_file, addr1, lda, j, n-ku_rows );
              if( number != n-ku_rows ) return 1;
              j_start--;
         }

         j_end = MIN(m,n);
         addr1 = a+ku;
         number = GetValuesC( in_file, addr1, lda, 0, j_end );
         if( number != j_end ) return 1;

         kl_rows = ( kl <= m-1 ) ?  kl : m - 1;
         for ( i = 1; i < kl_rows+1; i++ ) {
              kl1 = ( i+j_end <= m ) ? j_end : m - i;
              addr1 = a+ku+i;
              number = GetValuesC( in_file, addr1, lda, 0, kl1 );
              if( number != kl1 ) return 1;
         }
      } /* if */
      return 0;
} /* GetBandArrayC */

MKL_INT GetBandArrayZ(FILE *in_file, CBLAS_LAYOUT*layout, MKL_INT kl, MKL_INT ku,
                  MKL_INT m, MKL_INT n, MKL_Complex16 *a,  MKL_INT lda)
{
      MKL_INT        i, j;
      MKL_INT        kl1, ku1, i_start, j_start, j_end, ku_rows, kl_rows, number;
      MKL_Complex16 *addr, *addr1;

      if (*layout == CblasRowMajor) {
         for( i = 0; i < MIN( m, n ); i++ ) {
            addr = a + i*lda;
            kl1  = ( i - kl <= 0 ) ? i : kl;
            ku1  = ( i + ku >= n ) ? MAX(0,n-i-1) : ku;
            j_start = kl - kl1;
            j_end = j_start + kl1 + ku1;
            addr1 = addr + j_start;
            number = GetValuesZ( in_file, addr1, 1, 0, j_end-j_start+1 );
            if( number != j_end-j_start+1 ) return 1;
         }
         for( i = MIN( m, n ); i < MIN( m, MIN( m, n ) + kl); i++ ) {
            addr = a + i*lda;
            kl1  = n - i + kl;
            j_start = ( kl > n ) ? kl - n : n - kl;
            j_end = j_start + kl1 - 1;
            addr1 = addr + j_start;
            number = GetValuesZ( in_file, addr1, 1, 0, j_end-j_start+1 );
            if( number != j_end-j_start+1 ) return 1;
         }
      } else if (*layout == CblasColMajor) {
         i_start = (ku > n ) ? ku - n + 1 : 0;
         ku_rows = (ku > n ) ? n - 1 : ku;
         j_start = ku_rows;
         for( i = 0; i < ku_rows; i++ ) {
              j = j_start*lda; addr1 = a+i+i_start;
              number = GetValuesZ( in_file, addr1, lda, j, n-ku_rows );
              if( number != n-ku_rows ) return 1;
              j_start--;
         }

         j_end = MIN(m,n);
         addr1 = a+ku;
         number = GetValuesZ( in_file, addr1, lda, 0, j_end );
         if( number != j_end ) return 1;

         kl_rows = ( kl <= m-1 ) ?  kl : m - 1;
         for ( i = 1; i < kl_rows+1; i++ ) {
              kl1 = ( i+j_end <= m ) ? j_end : m - i;
              addr1 = a+ku+i;
              number = GetValuesZ( in_file, addr1, lda, 0, kl1 );
              if( number != kl1 ) return 1;
         }
      } /* if */
      return 0;
} /* GetBandArrayZ */

MKL_INT GetValuesI( FILE *in_file, MKL_INT *in_array, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    int     value;
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%d", &value) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i]=(MKL_INT)value;
       counter++;
       if ( (str = strtok( NULL, " " )) == NULL  ) break;
    }
    return counter;
}

MKL_INT GetValuesI8( FILE *in_file, MKL_INT8 *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    int value_int;
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%d", &value_int) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld]=(MKL_INT8)value_int;
       counter++;
       if ( (str = strtok( NULL, " " )) == NULL  ) break;
    }
    return counter;
}



MKL_INT GetValuesI16( FILE *in_file, MKL_INT16 *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    MKL_INT16     value;
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%hd", &value) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld]=(MKL_INT16)value;
       counter++;
       if ( (str = strtok( NULL, " " )) == NULL  ) break;
    }
    return counter;
}

MKL_INT GetValuesI32( FILE *in_file, MKL_INT32 *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    MKL_INT32     value;
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%d", &value) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld]=(MKL_INT32)value;
       counter++;
       if ( (str = strtok( NULL, " " )) == NULL  ) break;
    }
    return counter;
}

MKL_INT GetValuesBF16( FILE *in_file, MKL_BF16 *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    float   temp;
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%f", &temp) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld]=f2b(temp);
       counter++;
       if ( (str = strtok( NULL, " " )) == NULL  ) break;
    }
    return counter;
} /* GetValuesBF16 */

MKL_INT GetValuesH( FILE *in_file, MKL_F16 *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    float   temp;
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%f", &temp) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld]=f2h(temp);
       counter++;
       if ( (str = strtok( NULL, " " )) == NULL  ) break;
    }
    return counter;
} /* GetValuesH */

MKL_INT GetValuesD( FILE *in_file, double *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    double  temp;
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%lf", &temp) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld]=temp;
       counter++;
       if ( (str = strtok( NULL, " " )) == NULL  ) break;
    }
    return counter;
} /* GetValuesD */

MKL_INT GetValuesS( FILE *in_file, float *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    float   temp;
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%f", &temp) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld]=temp;
       counter++;
       if ( (str = strtok( NULL, " " )) == NULL  ) break;
    }
    return counter;
} /* GetValuesS */

MKL_INT GetValuesC( FILE *in_file, MKL_Complex8 *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    float   re, im;
    char    seps[]=" ,()\t";
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, seps );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%f", &re) != 1 || (str = strtok( NULL,seps)) == NULL ||
           sscanf( str, "%f", &im) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld].real=re;
       in_array[begin+i*ld].imag=im;
       counter++;
       if ( (str = strtok( NULL, seps )) == NULL  ) break;
    }
    return counter;
} /* GetValuesC */

MKL_INT GetValuesZ( FILE *in_file, MKL_Complex16 *in_array, MKL_INT ld, MKL_INT begin, MKL_INT max_numbers )
{
    MKL_INT i, counter=0;
    double  re, im;
    char    seps[]=" ,()\t";
    char    buf[MAX_STRING_LEN], *str;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, seps );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    for( i = 0; i < max_numbers; i++ ) {
       if ( *str==COMMENTS ) break;
       if( sscanf( str, "%lf", &re) != 1 || (str = strtok( NULL,seps))== NULL ||
           sscanf( str, "%lf", &im) != 1 ){
           printf( "\n File format is inappropriate\n" );
           return 0;
       }
       in_array[begin+i*ld].real=re;
       in_array[begin+i*ld].imag=im;
       counter++;
       if ( (str = strtok( NULL, seps )) == NULL  ) break;
    }
    return counter;
} /* GetValuesZ */

int GetIntegerParameters8(FILE *in_file, ...)
{
    int       counter=0;
    char      buf[MAX_STRING_LEN], *str;
    MKL_INT8 *param;
    va_list   ap;
    int      value;

    do {
       fgets(buf, MAX_STRING_LEN, in_file);
       str = strtok( buf, " " );
       if( str == NULL ){
           printf("\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start(ap, in_file);
    param = va_arg(ap, MKL_INT8 *);

	while ( 1 ) {
        if( *str == COMMENTS ) break;
        if( sscanf(str, "%d", &value ) != 1 ){
            printf("\n File format is inappropriate\n");
            va_end( ap );
            return 0;
        }
        *param = (MKL_INT8)value;
        counter++;
        if( (str = strtok( NULL, " " )) == NULL ) break;
        param = va_arg( ap, MKL_INT8 * );
    }
    va_end( ap );
    return counter;
} /* GetIntegerParameters16 */

int GetIntegerParameters16(FILE *in_file, ...)
{
    int       counter=0;
    char      buf[MAX_STRING_LEN], *str;
    MKL_INT16 *param;
    va_list   ap;
    MKL_INT16      value;

    do {
       fgets(buf, MAX_STRING_LEN, in_file);
       str = strtok( buf, " " );
       if( str == NULL ){
           printf("\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start(ap, in_file);
    param = va_arg(ap, MKL_INT16 *);

	while ( 1 ) {
        if( *str == COMMENTS ) break;
        if( sscanf(str, "%hd", &value ) != 1 ){
            printf("\n File format is inappropriate\n");
            va_end( ap );
            return 0;
        }
        *param = (MKL_INT16)value;
        counter++;
        if( (str = strtok( NULL, " " )) == NULL ) break;
        param = va_arg( ap, MKL_INT16 * );
    }
    va_end( ap );
    return counter;
} /* GetIntegerParameters16 */

int GetIntegerParameters32(FILE *in_file, ...)
{
    int       counter=0;
    char      buf[MAX_STRING_LEN], *str;
    MKL_INT32 *param;
    va_list   ap;
    MKL_INT32      value;

    do {
       fgets(buf, MAX_STRING_LEN, in_file);
       str = strtok( buf, " " );
       if( str == NULL ){
           printf("\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start(ap, in_file);
    param = va_arg(ap, MKL_INT32 *);

	while ( 1 ) {
        if( *str == COMMENTS ) break;
        if( sscanf(str, "%d", &value ) != 1 ){
            printf("\n File format is inappropriate\n");
            va_end( ap );
            return 0;
        }
        *param = (MKL_INT32)value;
        counter++;
        if( (str = strtok( NULL, " " )) == NULL ) break;
        param = va_arg( ap, MKL_INT32 * );
    }
    va_end( ap );
    return counter;
} /* GetIntegerParameters32 */

int GetIntegerParameters(FILE *in_file, ...)
{
    int      counter=0;
    char     buf[MAX_STRING_LEN], *str;
    MKL_INT *param;
    va_list  ap;
    int      value;

    do {
       fgets(buf, MAX_STRING_LEN, in_file);
       str = strtok( buf, " " );
       if( str == NULL ){
           printf("\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start(ap, in_file);
    param = va_arg(ap, MKL_INT *);

	while ( 1 ) {
        if( *str == COMMENTS ) break;
        if( sscanf(str, "%d", &value ) != 1 ){
            printf("\n File format is inappropriate\n");
            va_end( ap );
            return 0;
        }
        *param = (MKL_INT)value;
        counter++;
        if( (str = strtok( NULL, " " )) == NULL ) break;
        param = va_arg( ap, MKL_INT * );
    }
    va_end( ap );
    return counter;
} /* GetIntegerParameters */

int GetCblasCharParameters(FILE *in_file, ...)
{
    int      counter=0;
    char     buf[MAX_STRING_LEN], *str;
    int     *param;
    va_list  ap;

    do {
       fgets(buf, MAX_STRING_LEN, in_file);
       str = strtok( buf, " " );
       if( str == NULL ){
           printf("\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start(ap, in_file);
    param = va_arg(ap, int *);

	while ( 1 ) {
        if( *str == COMMENTS ) break;
        if( sscanf(str, "%d", param ) != 1 ){
            printf("\n File format is inappropriate\n");
            va_end( ap );
            return 0;
        }
        counter++;
        if( (str = strtok( NULL, " " )) == NULL ) break;
        param = va_arg( ap, int * );
    }
    va_end( ap );
    return counter;
} /* GetCblasCharParameters */

int GetScalarsH( FILE *in_file, ... )
{
    int     counter=0;
    char    buf[MAX_STRING_LEN], *str;
    MKL_F16 *param;
    va_list ap;
    float   t;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start( ap, in_file );
    param = va_arg( ap, MKL_F16 * );
    while ( 1 ) {
        if ( *str == COMMENTS )  break;
        if( sscanf( str, "%f", &t ) != 1 ){
            printf( "\n File format is inappropriate\n" );
            va_end( ap );
            return 0;
        }
	*param = f2h(t);
        counter++;
        if ( (str = strtok( NULL, " " )) == NULL  ) break;
        param = va_arg( ap, MKL_F16 * );
    }
    va_end( ap );
    return counter;
} /* GetScalarsH */

int GetScalarsD( FILE *in_file, ... )
{
    int     counter=0;
    char    buf[MAX_STRING_LEN], *str;
    double  *param;
    va_list ap;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start( ap, in_file );
    param = va_arg( ap, double * );
    while ( 1 ) {
        if ( *str == COMMENTS ) break;
        if( sscanf( str, "%lf", param ) != 1 ){
            printf( "\n File format is inappropriate\n" );
            va_end( ap );
            return 0;
        }
        counter++;
        if ( (str = strtok( NULL, " " )) == NULL  ) break;
        param = va_arg( ap, double * );
    }
    va_end( ap );
    return counter;
} /* GetScalarsD */

int GetScalarsS( FILE *in_file, ... )
{
    int     counter=0;
    char    buf[MAX_STRING_LEN], *str;
    float  *param;
    va_list ap;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, " " );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start( ap, in_file );
    param = va_arg( ap, float * );
    while ( 1 ) {
        if ( *str == COMMENTS )  break;
        if( sscanf( str, "%f", param ) != 1 ){
            printf( "\n File format is inappropriate\n" );
            va_end( ap );
            return 0;
        }
        counter++;
        if ( (str = strtok( NULL, " " )) == NULL  ) break;
        param = va_arg( ap, float * );
    }
    va_end( ap );
    return counter;
} /* GetScalarsS */

int GetScalarsC( FILE *in_file, ... )
{
    int           counter=0;
    char          buf[MAX_STRING_LEN], *str;
    char          seps[]=" ,()\t";
    float         re = 0.f, im = 0.f;
    MKL_Complex8 *param;
    va_list       ap;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, seps );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start( ap, in_file );
    param = va_arg( ap, MKL_Complex8 * );
    while ( 1 ) {
        if ( *str == COMMENTS )  break;
        if( sscanf( str, "%f", &re) != 1 || (str = strtok( NULL,seps))== NULL ||
            sscanf( str, "%f", &im) != 1 ){
            printf( "\n File format is inappropriate %f %f\n", re, im );
            va_end( ap );
            return 0;
        }
        param->real = re;
        param->imag = im;
        counter++;
        if ( (str = strtok( NULL, seps )) == NULL  )  break;
        param = va_arg( ap, MKL_Complex8 * );
    }
    va_end( ap );
    return counter;
} /* GetScalarsC */

int GetScalarsZ( FILE *in_file, ... )
{
    int            counter=0;
    char           buf[MAX_STRING_LEN], *str;
    char           seps[]=" ,()\t";
    double         re = 0., im = 0.;
    MKL_Complex16 *param;
    va_list        ap;

    do {
       fgets( buf, MAX_STRING_LEN, in_file );
       str = strtok( buf, seps );
       if( str == NULL ){
           printf( "\n File format is inappropriate\n");
           return 0;
       }
    } while ( *str == COMMENTS );
    va_start( ap, in_file );
    param = va_arg( ap, MKL_Complex16 * );
    while ( 1 ) {
        if ( *str == COMMENTS ) break;
        if( sscanf( str, "%lf", &re) != 1 || (str = strtok( NULL,seps)) == NULL ||
            sscanf( str, "%lf", &im) != 1 ){
            printf( "\n File format is inappropriate %f %f\n", re, im );
            va_end( ap );
            return 0;
        }
        param->real = re;
        param->imag = im;
        counter++;
        if ( (str = strtok( NULL, seps )) == NULL  ) break;
        param = va_arg( ap, MKL_Complex16 * );
    }
    va_end( ap );
    return counter;
} /* GetScalarsZ */

void PrintParameters( char *names, ... )
{
    char            *p, *str, str1[MAX_STRING_LEN], buf[MAX_STRING_LEN];
    va_list          ap;
    char             seps[]=" ,";
    MKL_INT          itmp;
    MKL_UINT         i;

    printf("\n       ");
    va_start( ap, names );
    strcpy(buf, names);
    str = strtok( buf, seps );
    if( str == NULL ){
        printf( "\n You must determine the parameters names\n");
        return;
    }
    do {
       strcpy(str1, str);
       p = str1;
       for( i = 0; i < strlen(str1); i++ ) {
          *p = (char) tolower(str1[i]);
          p++;
       }
       if ( strcmp( str1, "layout" ) == 0 ) {
           itmp = va_arg( ap, CBLAS_LAYOUT );
           if ( itmp == CblasRowMajor )
                printf("LAYOUT = CblasRowMajor  ");
           else if ( itmp == CblasColMajor )
                printf("LAYOUT = CblasColMajor  ");
       } else if ( strcmp( str1, "side" ) == 0 ) {
            itmp = va_arg( ap, CBLAS_SIDE);
            if ( itmp == CblasLeft )
               printf("SIDE = CblasLeft  ");
            else if ( itmp == CblasRight )
               printf("SIDE = CblasRight  ");
       } else if ( strcmp( str1, "uplo" ) == 0 ) {
            itmp = va_arg( ap, CBLAS_UPLO );
            if ( itmp == CblasUpper )
               printf("UPLO = CblasUpper  ");
            else if ( itmp == CblasLower )
               printf("UPLO = CblasLower  ");
       } else if ( strcmp( str1, "diag" ) == 0 ) {
            itmp = va_arg( ap, CBLAS_DIAG );
            if ( itmp == CblasUnit )
               printf("DIAG=CblasUnit  ");
            else if ( itmp == CblasNonUnit )
               printf("DIAG = CblasNonUnit  ");
       } else if ( ( strcmp( str1, "trans"  ) == 0 ) ||
                   ( strcmp( str1, "transa" ) == 0 ) ||
                   ( strcmp( str1, "transb" ) == 0 ) ) {
            itmp = va_arg( ap, CBLAS_TRANSPOSE );
            if ( itmp == CblasNoTrans )
               printf("%s = CblasNoTrans  ", str);
            else if ( itmp == CblasTrans )
               printf("%s = CblasTrans  ", str);
            else if ( itmp == CblasConjTrans )
               printf("%s = CblasConjTrans  ", str);
       } else if ( strcmp( str1, "offsetc" ) == 0 ) {
           itmp = va_arg( ap, CBLAS_OFFSET );
           if ( itmp == CblasFixOffset ) {
               printf("OFFSETC = CblasFixOffset ");
           } else if ( itmp == CblasRowOffset ) {
               printf("OFFSETC = CblasRowOffset ");
           } else if ( itmp == CblasColOffset ) {
               printf("OFFSETC = CblasColOffset ");
           }
       }
    } while ( (str = strtok( NULL, seps )) != NULL );
    va_end( ap );
    return;
} /* PrintParameters */

void PrintVectorI(MKL_INT n, MKL_INT *x, char *name)
{
      MKL_INT i;

      printf("\n       VECTOR %s\n         ", name);
      for (i = 0; i < n; i++)
          printf("    "INT_FORMAT, *(x+i));
      return;
} /* PrintVectorI */

void PrintVectorI16(MKL_INT n, MKL_INT16 *x, char *name)
{
      MKL_INT i;

      printf("\n       VECTOR %s\n         ", name);
      for (i = 0; i < n; i++)
          printf("    %d", *(x+i));
      return;
} /* PrintVectorI16 */

void PrintVectorI32(MKL_INT n, MKL_INT32 *x, char *name)
{
      MKL_INT i;

      printf("\n       VECTOR %s\n         ", name);
      for (i = 0; i < n; i++)
          printf("    %d", *(x+i));
      return;
} /* PrintVectorI32 */

void PrintVectorS(int flag, MKL_INT n, float *x,  MKL_INT incx, char *name)
{
      MKL_INT i;

      switch(flag) {
      case 0:
           printf("\n       VECTOR %s   INC%s=" INT_FORMAT "\n         ",
                    name, name, incx);
           break;
      case 1:
           printf("\n       VECTOR %s\n         ", name);
           break;
      default:
           break;
      } /* switch */
      for (i = 0; i < (1+(n-1)*ABS(incx)); i++)
          printf("%6.2f  ", *(x+i));
      return;
} /* PrintVectorS */

void PrintVectorD(int flag, MKL_INT n, double *x,  MKL_INT incx, char *name)
{
      MKL_INT i;

      switch(flag) {
      case 0:
           printf("\n       VECTOR %s   INC%s=" INT_FORMAT "\n         ",
                    name, name, incx);
           break;
      case 1:
           printf("\n       VECTOR %s\n         ", name);
           break;
      default:
           break;
      } /* switch */
      for (i = 0; i < (1+(n-1)*ABS(incx)); i++)
          printf("%8.3f  ", *(x+i));
      return;
} /* PrintVectorD */

void PrintVectorC(int flag, MKL_INT n, MKL_Complex8 *x,  MKL_INT incx, char *name)
{
      MKL_INT i;

      switch(flag) {
      case 0:
           printf("\n       VECTOR %s   INC%s=" INT_FORMAT "\n         ",
                    name, name, incx);
           break;
      case 1:
           printf("\n       VECTOR %s\n         ", name);
           break;
      default:
           break;
      } /* switch */
      for (i = 0; i < (1+(n-1)*ABS(incx)); i++)
          printf("(%6.2f,%6.2f)   ", (x+i)->real, (x+i)->imag);
      return;
} /* PrintVectorC */

void PrintVectorZ(int flag, MKL_INT n, MKL_Complex16 *x,  MKL_INT incx, char *name)
{
      MKL_INT i;

      switch(flag) {
      case 0:
           printf("\n       VECTOR %s   INC%s=" INT_FORMAT "\n         ",
                    name, name, incx);
               break;
      case 1:
           printf("\n       VECTOR %s\n         ", name);
               break;
      default:
               break;
      } /* switch */
      for (i = 0; i < (1+(n-1)*ABS(incx)); i++)
          printf("(%6.2f,%6.2f)   ", (x+i)->real, (x+i)->imag);
      return;
} /* PrintVectorZ */

void PrintArrayI8(CBLAS_LAYOUT*layout, int flag1, int flag2, MKL_INT *m, MKL_INT *n, MKL_INT8 *a,
                MKL_INT *lda, char *name)
{
        MKL_INT i, j;
        MKL_INT8 *addr;

        switch(flag1) {
        case 0:
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, *lda);
            break;
        case 1:
            printf("\n       ARRAY %s", name);
            break;
        default:
            break;
        } /* switch */
        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("%d  ", (int) *(addr+j));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%d  ", (int) *(addr+j));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("%d  ", (int) *(addr+j));
                } /* for */
            } /* if */
        } else if (*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("%d  ", (int) *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%d  ", (int)*(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("%d  ", (int) *(addr+j*(*lda)));
                } /* for */
            } /* if */
        } /* if */
        return;
} /* PrintArrayI8 */

void PrintArrayI16(CBLAS_LAYOUT*layout, int flag1, int flag2, MKL_INT *m, MKL_INT *n, MKL_INT16 *a,
                MKL_INT *lda, char *name)
{
        MKL_INT i, j;
        MKL_INT16 *addr;

        switch(flag1) {
        case 0:
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, *lda);
            break;
        case 1:
            printf("\n       ARRAY %s", name);
            break;
        default:
            break;
        } /* switch */
        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("%hd  ", *(addr+j));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%hd  ", *(addr+j));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("%hd  ", *(addr+j));
                } /* for */
            } /* if */
        } else if (*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("%hd  ", *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%hd  ", *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("%hd  ", *(addr+j*(*lda)));
                } /* for */
            } /* if */
        } /* if */
        return;
} /* PrintArrayI16 */

void PrintArrayI32(CBLAS_LAYOUT*layout, int flag1, int flag2, MKL_INT *m, MKL_INT *n, MKL_INT32 *a,
                MKL_INT *lda, char *name)
{
        MKL_INT i, j;
        MKL_INT32 *addr;

        switch(flag1) {
        case 0:
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, *lda);
            break;
        case 1:
            printf("\n       ARRAY %s", name);
            break;
        default:
            break;
        } /* switch */
        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("%d  ", *(addr+j));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%d  ", *(addr+j));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("%d  ", *(addr+j));
                } /* for */
            } /* if */
        } else if (*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("%d  ", *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%d  ", *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("%d  ", *(addr+j*(*lda)));
                } /* for */
            } /* if */
	} /* if */
	return;
} /* PrintArrayI32 */

void PrintArrayBF16(CBLAS_LAYOUT*layout, int flag1, int flag2, MKL_INT *m, MKL_INT *n, MKL_BF16 *a,
                MKL_INT *lda, char *name)
{
        MKL_INT i, j;
        MKL_BF16 *addr;

        switch(flag1) {
        case 0:
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, *lda);
            break;
        case 1:
            printf("\n       ARRAY %s", name);
            break;
        default:
            break;
        } /* switch */
        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("%6.2f  ", b2f(*(addr+j)));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%6.2f  ", b2f(*(addr+j)));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("%6.2f  ", b2f(*(addr+j)));
                } /* for */
            } /* if */
        } else if (*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("%6.2f  ", b2f(*(addr+j*(*lda))));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%6.2f  ", b2f(*(addr+j*(*lda))));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("%6.2f  ", b2f(*(addr+j*(*lda))));
                } /* for */
            } /* if */
        } /* if */
        return;
} /* PrintArrayBF16 */

void PrintArrayH(CBLAS_LAYOUT*layout, int flag1, int flag2, MKL_INT *m, MKL_INT *n, MKL_F16 *a,
                MKL_INT *lda, char *name)
{
        MKL_INT i, j;
        MKL_F16 *addr;

        switch(flag1) {
        case 0:
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, *lda);
            break;
        case 1:
            printf("\n       ARRAY %s", name);
            break;
        default:
            break;
        } /* switch */
        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("%6.2f  ", h2f(*(addr+j)));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%6.2f  ", h2f(*(addr+j)));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("%6.2f  ", h2f(*(addr+j)));
                } /* for */
            } /* if */
        } else if (*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("%6.2f  ", h2f(*(addr+j*(*lda))));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%6.2f  ", h2f(*(addr+j*(*lda))));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("%6.2f  ", h2f(*(addr+j*(*lda))));
                } /* for */
            } /* if */
        } /* if */
        return;
} /* PrintArrayH */

void PrintArrayS(CBLAS_LAYOUT*layout, int flag1, int flag2, MKL_INT *m, MKL_INT *n, float *a,
                MKL_INT *lda, char *name)
{
        MKL_INT i, j;
        float   *addr;

        switch(flag1) {
        case 0:
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, *lda);
            break;
        case 1:
            printf("\n       ARRAY %s", name);
            break;
        default:
            break;
        } /* switch */
        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("%6.2f  ", *(addr+j));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%6.2f  ", *(addr+j));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("%6.2f  ", *(addr+j));
                } /* for */
            } /* if */
        } else if (*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("%6.2f  ", *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("        ");
                    for (j = i; j < *n; j++)
                        printf("%6.2f  ", *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < (*m); i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("%6.2f  ", *(addr+j*(*lda)));
                } /* for */
            } /* if */
        } /* if */
        return;
} /* PrintArrayS */

void PrintArrayD(CBLAS_LAYOUT*layout, int flag1, int flag2, MKL_INT *m, MKL_INT *n, double *a,
                MKL_INT *lda, char *name)
{
        MKL_INT i, j;
        double  *addr;

        switch(flag1) {
        case 0:
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT,
                    name, name, *lda);
            break;
        case 1:
            printf("\n       ARRAY %s", name);
            break;
        default:
            break;
        } /* switch */
        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("%8.3f  ", *(addr+j));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("          ");
                    for (j = i; j < *n; j++)
                        printf("%8.3f  ", *(addr+j));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("%8.3f  ", *(addr+j));
                } /* for */
            } /* if */
        } else if (*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("%8.3f  ", *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("          ");
                    for (j = i; j < *n; j++)
                        printf("%8.3f  ", *(addr+j*(*lda)));
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("%8.3f  ", *(addr+j*(*lda)));
                } /* for */
            } /* if */
        } /* if */
        return;
} /* PrintArrayD */

void PrintArrayC(CBLAS_LAYOUT*layout, int flag1, int flag2, MKL_INT *m, MKL_INT *n, MKL_Complex8 *a,
                MKL_INT *lda, char *name)
{
        MKL_INT  i, j;
        MKL_Complex8 *addr;

        switch(flag1) {
        case 0:
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT,
                    name, name, *lda);
            break;
        case 1:
            printf("\n       ARRAY %s", name);
            break;
        default:
            break;
        } /* switch */
        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j)->real, (addr+j)->imag);
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("                  ");
                    for (j = i; j < *n; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j)->real, (addr+j)->imag);
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j)->real, (addr+j)->imag);
                } /* for */
            } /* if */
        } else if(*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j*(*lda))->real,
                                (addr+j*(*lda))->imag);
                }  /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("                  ");
                    for (j = i; j < *n; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j*(*lda))->real,
                                (addr+j*(*lda))->imag);
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j*(*lda))->real,
                                (addr+j*(*lda))->imag);
                } /* for */
            } /* if */
        } /* if */
        return;
} /* PrintArrayC */

void PrintArrayZ(CBLAS_LAYOUT*layout, int flag1, int flag2,
                 MKL_INT *m, MKL_INT *n, MKL_Complex16 *a, MKL_INT *lda, char *name)
{
        MKL_INT   i, j;
        MKL_Complex16 *addr;

        if (flag1 == 0)
            printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, *lda);
        else
            printf("\n       ARRAY %s", name);

        if (*layout == CblasRowMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j < *n; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j)->real,
                               (addr+j)->imag);
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j=0; j < i; j++)
                        printf("                  ");
                    for (j = i; j < *n; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j)->real,
                               (addr+j)->imag);
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i*(*lda);
                    for (j = 0; j <= i; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j)->real,
                                (addr+j)->imag);
                } /* for */
            } /* if */
        } else if (*layout == CblasColMajor) {
            if (flag2 == GENERAL_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j < *n; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j*(*lda))->real,
                                (addr+j*(*lda))->imag);
                } /* for */
            } else if (flag2 == UPPER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j=0; j < i; j++)
                        printf("                  ");
                    for (j = i; j < *n; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j*(*lda))->real,
                                (addr+j*(*lda))->imag);
                } /* for */
            } else if (flag2 == LOWER_MATRIX) {
                for (i = 0; i < *m; i++) {
                    printf("\n         ");
                    addr = a + i;
                    for (j = 0; j <= i; j++)
                        printf("(%6.2f,%6.2f)   ", (addr+j*(*lda))->real,
                                (addr+j*(*lda))->imag);
                } /* for */
            } /* if */
        } /* if */
        return;
} /* PrintArrayZ */

void PrintBandArrayS(CBLAS_LAYOUT*layout, int flag,
                     MKL_INT kl, MKL_INT ku, MKL_INT m, MKL_INT n,
                     float *a, MKL_INT lda, char *name)
{
      MKL_INT    i, j, l;
      float *addr, *addr1;
      MKL_INT    kl1, ku1, i_start, j_start, j_end, kl_rows, ku_rows;

      if (flag == 0)
          printf("\n       ARRAY %s   LD%s=" INT_FORMAT "  KL=" INT_FORMAT "  KU=" INT_FORMAT,
                   name, name, lda, kl, ku);
      else
          printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, lda);

      if (*layout == CblasRowMajor) {
         for( i = 0; i < MIN( m, n ); i++ ) {
            printf("\n    ");
            addr = a + i*lda;
            kl1  = ( i - kl <= 0 ) ? i : kl;
            ku1  = ( i + ku >= n ) ? MAX(0,n-i-1) : ku;
            j_start = kl - kl1;
            j_end = j_start + kl1 + ku1;
            addr1 = addr + j_start;
            for( l=0; l < j_start; l++ )
                printf("        ");
            for( l=0; l < j_end-j_start+1; l++ )
                printf("%6.2f  ", *(addr1+l));
         }
         for( i = MIN( m, n ); i < MIN( m, MIN( m, n ) + kl); i++ ) {
            printf("\n    ");
            addr = a + i*lda;
            kl1  = n - i + kl;
            j_start = ( kl > n ) ? kl - n : n - kl;
            j_end = j_start + kl1 - 1;
            addr1 = addr + j_start;
            for( l=0; l < j_start; l++ )
                printf("        ");
            for( l=0; l < j_end-j_start+1; l++ )
                printf("%6.2f  ", *(addr1+l));
         }
      } else if (*layout == CblasColMajor) {
         i_start = (ku > n ) ? ku - n + 1 : 0;
         ku_rows = (ku > n ) ? n - 1 : ku;
         j_start = ku_rows;
         for( i = 0; i < ku_rows; i++ ) {
              printf("\n      ");
              j = j_start*lda; addr1 = a + i + i_start;
              for( l=0; l < j_start; l++ )
                 printf("        ");
              for( l=0; l < n-ku_rows; l++ )
                 printf("%6.2f  ", *(addr1+j+l*lda) );
              j_start--;
         }

         j_end = MIN(m,n);
         addr1 = a + ku;
         printf("\n      ");
         for( l=0; l < j_end; l++ )
            printf("%6.2f  ", *(addr1+l*lda) );

         kl_rows = ( kl <= m-1 ) ?  kl : m - 1;
         for ( i = 1; i < kl_rows+1; i++ ) {
              printf("\n      ");
              kl1 = ( i+j_end <= m ) ? j_end : m - i;
              addr1 = a + ku + i;
              for( l=0; l < kl1; l++ )
                 printf("%6.2f  ", *(addr1+l*lda) );
         }
      } /* if */
      return;
} /* PrintBandArrayS */

void PrintBandArrayD(CBLAS_LAYOUT*layout, int flag,
                     MKL_INT kl, MKL_INT ku, MKL_INT m, MKL_INT n,
                     double *a, MKL_INT lda, char *name)
{
      MKL_INT i, j, l;
      double *addr, *addr1;
      MKL_INT kl1, ku1, i_start, j_start, j_end, kl_rows, ku_rows;

      if (flag == 0)
          printf("\n       ARRAY %s   LD%s=" INT_FORMAT "  KL=" INT_FORMAT "  KU=" INT_FORMAT,
                   name, name, lda, kl, ku);
      else
          printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, lda);

      if (*layout == CblasRowMajor) {
         for( i = 0; i < MIN( m, n ); i++ ) {
            printf("\n    ");
            addr = a + i*lda;
            kl1  = ( i - kl <= 0 ) ? i : kl;
            ku1  = ( i + ku >= n ) ? MAX(0,n-i-1) : ku;
            j_start = kl - kl1;
            j_end = j_start + kl1 + ku1;
            addr1 = addr + j_start;
            for( l=0; l < j_start; l++ )
                printf("        ");
            for( l=0; l < j_end-j_start+1; l++ )
                printf("%6.2f  ", *(addr1+l));
         }
         for( i = MIN( m, n ); i < MIN( m, MIN( m, n ) + kl); i++ ) {
            printf("\n    ");
            addr = a + i*lda;
            kl1  = n - i + kl;
            j_start = ( kl > n ) ? kl - n : n - kl;
            j_end = j_start + kl1 - 1;
            addr1 = addr + j_start;
            for( l=0; l < j_start; l++ )
                printf("        ");
            for( l=0; l < j_end-j_start+1; l++ )
                printf("%6.2f  ", *(addr1+l));
         }
      } else if (*layout == CblasColMajor) {
         i_start = (ku > n ) ? ku - n + 1 : 0;
         ku_rows = (ku > n ) ? n - 1 : ku;
         j_start = ku_rows;
         for( i = 0; i < ku_rows; i++ ) {
              printf("\n      ");
              j = j_start*lda; addr1 = a + i + i_start;
              for( l=0; l < j_start; l++ )
                 printf("        ");
              for( l=0; l < n-ku_rows; l++ )
                 printf("%6.2f  ", *(addr1+j+l*lda) );
              j_start--;
         }

         j_end = MIN(m,n);
         addr1 = a + ku;
         printf("\n      ");
         for( l=0; l < j_end; l++ )
            printf("%6.2f  ", *(addr1+l*lda) );

         kl_rows = ( kl <= m-1 ) ?  kl : m - 1;
         for ( i = 1; i < kl_rows+1; i++ ) {
              printf("\n      ");
              kl1 = ( i+j_end <= m ) ? j_end : m - i;
              addr1 = a + ku + i;
              for( l=0; l < kl1; l++ )
                 printf("%6.2f  ", *(addr1+l*lda) );
         }
      } /* if */
      return;
} /* PrintBandArrayD */

void PrintBandArrayC(CBLAS_LAYOUT*layout, int flag,
                     MKL_INT kl, MKL_INT ku, MKL_INT m, MKL_INT n,
                     MKL_Complex8 *a, MKL_INT lda, char *name)
{
      MKL_INT       i, j, l;
      MKL_Complex8 *addr, *addr1;
      MKL_INT       kl1, ku1, j_start, j_end, i_start, kl_rows, ku_rows;

      if (flag == 0)
          printf("\n       ARRAY %s   LD%s=" INT_FORMAT "  KL=" INT_FORMAT "  KU=" INT_FORMAT,
                   name, name, lda, kl, ku);
      else
          printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, lda);

      if (*layout == CblasRowMajor) {
         for( i = 0; i < MIN( m, n ); i++ ) {
            printf("\n    ");
            addr = a + i*lda;
            kl1  = ( i - kl <= 0 ) ? i : kl;
            ku1  = ( i + ku >= n ) ? MAX(0,n-i-1) : ku;
            j_start = kl - kl1;
            j_end = j_start + kl1 + ku1;
            addr1 = addr + j_start;
            for( l=0; l < j_start; l++ )
                printf("                 ");
            for( l=0; l < j_end-j_start+1; l++ )
                printf("(%6.2f,%6.2f)  ", (addr1+l)->real, (addr1+l)->imag);
         }
         for( i = MIN( m, n ); i < MIN( m, MIN( m, n ) + kl); i++ ) {
            printf("\n    ");
            addr = a + i*lda;
            kl1  = n - i + kl;
            j_start = ( kl > n ) ? kl - n : n - kl;
            j_end = j_start + kl1 - 1;
            addr1 = addr + j_start;
            for( l=0; l < j_start; l++ )
                printf("                 ");
            for( l=0; l < j_end-j_start+1; l++ )
                printf("(%6.2f,%6.2f)  ", (addr1+l)->real, (addr1+l)->imag);
         }
      } else if (*layout == CblasColMajor) {
         i_start = (ku > n ) ? ku - n + 1 : 0;
         ku_rows = (ku > n ) ? n - 1 : ku;
         j_start = ku_rows;
         for( i = 0; i < ku_rows; i++ ) {
              printf("\n               ");
              j = j_start*lda; addr1 = a + i + i_start;
              for( l=0; l < j_start; l++ )
                 printf("                 ");
              for( l=0; l < n-ku_rows; l++ )
                 printf("(%6.2f,%6.2f)  ",
                         (addr1+j+l*lda)->real, (addr1+j+l*lda)->imag );
              j_start--;
         }

         j_end = MIN(m,n);
         addr1 = a + ku;
         printf("\n               ");
         for( l=0; l < j_end; l++ )
            printf("(%6.2f,%6.2f)  ",
                    (addr1+l*lda)->real, (addr1+l*lda)->imag );

         kl_rows = ( kl <= m-1 ) ?  kl : m - 1;
         for ( i = 1; i < kl_rows+1; i++ ) {
              printf("\n               ");
              kl1 = ( i+j_end <= m ) ? j_end : m - i;
              addr1 = a + ku + i;
              for( l=0; l < kl1; l++ )
                 printf("(%6.2f,%6.2f)  ",
                         (addr1+l*lda)->real, (addr1+l*lda)->imag );
         }
      } /* if */
      return;
} /* PrintBandArrayC */

void PrintBandArrayZ(CBLAS_LAYOUT*layout, int flag,
                     MKL_INT kl, MKL_INT ku, MKL_INT m, MKL_INT n,
                     MKL_Complex16 *a, MKL_INT lda, char *name)
{
      MKL_INT        i, j, l;
      MKL_Complex16 *addr, *addr1;
      MKL_INT        kl1, ku1, j_start, j_end, i_start, kl_rows, ku_rows;

      if (flag == 0)
          printf("\n       ARRAY %s   LD%s=" INT_FORMAT "  KL=" INT_FORMAT "  KU=" INT_FORMAT,
                   name, name, lda, kl, ku);
      else
          printf("\n       ARRAY %s   LD%s=" INT_FORMAT, name, name, lda);

      if (*layout == CblasRowMajor) {
         for( i = 0; i < MIN( m, n ); i++ ) {
            printf("\n    ");
            addr = a + i*lda;
            kl1  = ( i - kl <= 0 ) ? i : kl;
            ku1  = ( i + ku >= n ) ? MAX(0,n-i-1) : ku;
            j_start = kl - kl1;
            j_end = j_start + kl1 + ku1;
            addr1 = addr + j_start;
            for( l=0; l < j_start; l++ )
                printf("                 ");
            for( l=0; l < j_end-j_start+1; l++ )
                printf("(%6.2f,%6.2f)  ", (addr1+l)->real, (addr1+l)->imag);
         }
         for( i = MIN( m, n ); i < MIN( m, MIN( m, n ) + kl); i++ ) {
            printf("\n    ");
            addr = a + i*lda;
            kl1  = n - i + kl;
            j_start = ( kl > n ) ? kl - n : n - kl;
            j_end = j_start + kl1 - 1;
            addr1 = addr + j_start;
            for( l=0; l < j_start; l++ )
                printf("                 ");
            for( l=0; l < j_end-j_start+1; l++ )
                printf("(%6.2f,%6.2f)  ", (addr1+l)->real, (addr1+l)->imag);
         }
      } else if (*layout == CblasColMajor) {
         i_start = (ku > n ) ? ku - n + 1 : 0;
         ku_rows = (ku > n ) ? n - 1 : ku;
         j_start = ku_rows;
         for( i = 0; i < ku_rows; i++ ) {
              printf("\n      ");
              j = j_start*lda; addr1 = a + i + i_start;
              for( l=0; l < j_start; l++ )
                 printf("                 ");
              for( l=0; l < n-ku_rows; l++ )
                 printf("(%6.2f,%6.2f)  ",
                         (addr1+j+l*lda)->real, (addr1+j+l*lda)->imag );
              j_start--;
         }

         j_end = MIN(m,n);
         addr1 = a + ku;
          printf("\n      ");
         for( l=0; l < j_end; l++ )
            printf("(%6.2f,%6.2f)  ",
                    (addr1+l*lda)->real, (addr1+l*lda)->imag );

         kl_rows = ( kl <= m-1 ) ?  kl : m - 1;
         for ( i = 1; i < kl_rows+1; i++ ) {
              printf("\n      ");
              kl1 = ( i+j_end <= m ) ? j_end : m - i;
              addr1 = a + ku + i;
              for( l=0; l < kl1; l++ )
                 printf("(%6.2f,%6.2f)  ",
                         (addr1+l*lda)->real, (addr1+l*lda)->imag );
         }
      } /* if */
      return;
} /* PrintBandArrayZ */

cblas_sgemmx.c

/*******************************************************************************
* Copyright 1999-2022 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/

/*
!  Content:
!      C B L A S _ S G E M M  Example Program Text ( C Interface )
!******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

#include "mkl_example.h"

int main(int argc, char *argv[])
{
      FILE *in_file;
      char *in_file_name;

      MKL_INT         m, n, k;
      MKL_INT         lda, ldb, ldc;
      MKL_INT         rmaxa, cmaxa, rmaxb, cmaxb, rmaxc, cmaxc;
      float           alpha, beta;
      float          *a, *b, *c;
      CBLAS_LAYOUT    layout;
      CBLAS_TRANSPOSE transA, transB;
      MKL_INT         ma, na, mb, nb;

      printf("\n     C B L A S _ S G E M M  EXAMPLE PROGRAM\n");

/*       Get input parameters                                  */

      if( argc == 1 ) {
          printf("\n You must specify in_file data file as 1-st parameter");
          return 1;
      }
      in_file_name = argv[1];

/*       Get input data                                        */

      if( (in_file = fopen( in_file_name, "r" )) == NULL ) {
         printf("\n ERROR on OPEN '%s' with mode=\"r\"\n", in_file_name);
         return 1;
      }
      if( GetIntegerParameters(in_file, &m, &n, &k) != 3 ) {
          printf("\n ERROR of m, n, k reading\n");
          fclose(in_file);
          return 1;
      }
      if( GetScalarsS(in_file, &alpha, &beta) != 2 ) {
          printf("\n ERROR of alpha, beta reading\n");
          fclose(in_file);
          return 1;
      }
      if( GetCblasCharParameters(in_file, &transA, &transB, &layout) != 3 ) {
          printf("\n ERROR of transA, transB, layout reading\n");
          fclose(in_file);
          return 1;
      }

      if( transA == CblasNoTrans ) {
         rmaxa = m + 1;
         cmaxa = k;
         ma    = m;
         na    = k;
      } else {
         rmaxa = k + 1;
         cmaxa = m;
         ma    = k;
         na    = m;
      }
      if( transB == CblasNoTrans ) {
         rmaxb = k + 1;
         cmaxb = n;
         mb    = k;
         nb    = n;
      } else {
         rmaxb = n + 1;
         cmaxb = k;
         mb    = n;
         nb    = k;
      }
      rmaxc = m + 1;
      cmaxc = n;
      a = (float *)mkl_calloc(rmaxa*cmaxa, sizeof( float ), 64);
      b = (float *)mkl_calloc(rmaxb*cmaxb, sizeof( float ), 64);
      c = (float *)mkl_calloc(rmaxc*cmaxc, sizeof( float ), 64);
      if( a == NULL || b == NULL || c == NULL ) {
          printf( "\n Can't allocate memory for arrays\n");
          mkl_free(a);
          mkl_free(b);
          mkl_free(c);
          fclose(in_file);
          return 1;
      }
      if (layout == CblasRowMajor) {
          lda=cmaxa;
          ldb=cmaxb;
          ldc=cmaxc;
      } else {
          lda=rmaxa;
          ldb=rmaxb;
          ldc=rmaxc;
      }
      if( GetArrayS(in_file, &layout, GENERAL_MATRIX, &ma, &na, a, &lda) != 0 ) {
        printf("\n ERROR of array A reading\n");
        mkl_free(a);
        mkl_free(b);
        mkl_free(c);
        fclose(in_file);
        return 1;
      }
      if( GetArrayS(in_file, &layout, GENERAL_MATRIX, &mb, &nb, b, &ldb) != 0 ) {
        printf("\n ERROR of array B reading\n");
        mkl_free(a);
        mkl_free(b);
        mkl_free(c);
        fclose(in_file);
        return 1;
      }
      if( GetArrayS(in_file, &layout, GENERAL_MATRIX, &m,  &n,  c, &ldc) != 0 ) {
        printf("\n ERROR of array C reading\n");
        mkl_free(a);
        mkl_free(b);
        mkl_free(c);
        fclose(in_file);
        return 1;
      }
      fclose(in_file);

/*       Print input data                                      */

      printf("\n     INPUT DATA");
      printf("\n       M="INT_FORMAT"  N="INT_FORMAT"  K="INT_FORMAT, m, n, k);
      printf("\n       ALPHA=%5.1f  BETA=%5.1f", alpha, beta);
      PrintParameters("TRANSA, TRANSB", transA, transB);
      PrintParameters("LAYOUT", layout);
      PrintArrayS(&layout, FULLPRINT, GENERAL_MATRIX, &ma, &na, a, &lda, "A");
      PrintArrayS(&layout, FULLPRINT, GENERAL_MATRIX, &mb, &nb, b, &ldb, "B");
      PrintArrayS(&layout, FULLPRINT, GENERAL_MATRIX, &m, &n, c, &ldc, "C");

/*      Call SGEMM subroutine ( C Interface )                  */

      cblas_sgemm(layout, transA, transB, m, n, k, alpha,
                  a, lda, b, ldb, beta, c, ldc);

/*       Print output data                                     */

      printf("\n\n     OUTPUT DATA");
      PrintArrayS(&layout, FULLPRINT, GENERAL_MATRIX, &m, &n, c, &ldc, "C");

      mkl_free(a);
      mkl_free(b);
      mkl_free(c);

      return 0;
}

cblas_sgemmx.d

:  C B L A S _ S G E M M  EXAMPLE PROGRAM DATA
3 5 4                 :Values of m, n, k
0.5 1.2               :Values of alpha, beta
111 111 101           :Values of transA, transB, layout
1.0   2.0   3.0   4.0
3.2   2.7   8.1   5.0
0.0   4.0   5.0   3.0    :Values of array A
1.0   2.0   3.0   4.0   5.0
1.0   2.0   3.0   4.0   5.0
1.0   2.0   3.0   4.0   5.0
1.0   2.0   3.0   4.0   5.0    :Values of array B
0.0   0.0   1.0   1.0   1.0
0.0   0.0   1.0   1.0   1.0
0.0   0.0   1.0   1.0   1.0    :Values of array C

CMakeLists.txt

#===============================================================================
# Copyright 2023 Intel Corporation.
#
# This software and the related documents are Intel copyrighted  materials,  and
# your use of  them is  governed by the  express license  under which  they were
# provided to you (License).  Unless the License provides otherwise, you may not
# use, modify, copy, publish, distribute,  disclose or transmit this software or
# the related documents without Intel's prior written permission.
#
# This software and the related documents  are provided as  is,  with no express
# or implied  warranties,  other  than those  that are  expressly stated  in the
# License.
#===============================================================================

cmake_minimum_required(VERSION 3.13)
enable_testing()

# Define language and file extensions
set(TEST_LANG C)
set(TEST_EXT c)
set(DATA_EXT d)

project(MKL_Example LANGUAGES ${TEST_LANG})

# Include MKLConfig.cmake
find_package(MKL CONFIG REQUIRED)

# Build extra files
add_library(blas_common common_func.c)
if(WIN32)
  set(TEST_COPT "-D_CRT_SECURE_NO_WARNINGS")
endif()
# Add some parts of MKL target: headers and compiler options
target_include_directories(blas_common PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
target_compile_options(blas_common PUBLIC ${TEST_COPT} $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
set(TEST_LOPT blas_common)

# Build target
set(func "example")
set(func_name "cblas_sgemmx")
set(${func}_SRC ${func_name}.${TEST_EXT})

add_executable(${func} ${${func}_SRC})
set_property(TARGET ${func} PROPERTY C_STANDARD 99)
# Add MKL target: headers, compiler options and libraries to link
target_link_libraries(${func} PUBLIC ${TEST_LOPT} MKL::MKL)

# Register example as ctest
set(${func_name}_DATA ${PROJECT_SOURCE_DIR}/${func_name}.${DATA_EXT})
add_test(NAME ${func} COMMAND ${func} ${${func_name}_DATA})

# Add oneMKL environment variables if any defined
if(MKL_ENV)
  set_tests_properties(${func} PROPERTIES ENVIRONMENT "${MKL_ENV}")
endif()

ScaLAPACK Example

matrix_generator.h

/*******************************************************************************
* Copyright 2022 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************/
/*
!  Content:
!      Intel(R) oneAPI Math Kernel Library ScaLAPACK C example source file
!
!*******************************************************************************
!===============================================================================
!  Matrix generator routines for ScaLAPACK p?getrf and p?getrs example program 
!===============================================================================
! List of routines are given in the file:
!
! pdmatgen_random
! psmatgen_random
! pzmatgen_random
! psmatgen_random
!
! Definitions for element index numbers of ScaLAPACK array descriptors 
*/
#include "mkl.h"
#include "mkl_blacs.h"
#include "mkl_pblas.h"
#include "mkl_scalapack.h"

#define DTYPE_ 1
#define CTXT_ 2
#define M_ 3
#define N_ 4
#define MB_ 5
#define NB_ 6
#define RSRC_ 7
#define CSRC_ 8
#define LLD_ 9
/* Additional  definitions */
#ifndef min 
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b)) 
#endif
/* Local real and complex constants */
#define RZERO 0.0
#define RONE  1.0
#define RNEGONE  -1.0
#define CZERO {0.0f, 0.0f}
#define CONE  {1.0f, 0.0f}
#define CNEGONE  {-1.0f, 0.0f}

const MKL_INT izero = 0, ione = 1, isix = 6, inegone = -1, dist=1;

void pdmatgen_random(MKL_INT type, MKL_INT* desca, double* a, MKL_INT*  seed,  MKL_INT nprow, MKL_INT npcol)
/* 
*  Purpose
*  =======
*
*  pdmatgen_random: parallel real double precision matrix generator.
*  Generate a sub-matrix of A.
*
*  Arguments
*  =========
*
* type    (global input)
*          Type of global matrix to be generated.
*          If type==0, local matrices are parts of the matrix 
*          which possesses 1 as all elements along its leading (main) diagonal;
*          while all other elements in the matrix are 0. 
*          A random matrix is generated if type != 0.
*
*  desca   (global and local input) an array of (at least) length 9.
*          The array descriptor for the distributed matrix A.
*          See the notes below for the decription of each element.
*
*  a       (local output), pointer into the local
*          memory to an array containing the
*          local pieces of the distributed matrix.
*
*  seed    (local input and output) integer array, dimension (4)
*          On entry, the seed of the random number generator.
*          On exit, the seed is updated. 
*
*  nprow   (global input)
*          The number of process rows in the grid.
*
*  npcol   (global input)
*          The number of process columns in the grid.
*
*
*  Notes:
*  =====
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*
*  In  the  following  comments,   the character _  should  be  read  as
*  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
*  block cyclicly distributed matrix.  Its description vector is DESC_A:
*
*  NOTATION         STORED IN       EXPLANATION
*  ---------------- --------------- ------------------------------------
*  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
*  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
*                                   the NPROW x NPCOL BLACS process grid
*                                   A  is  distributed over. The context
*                                   itself  is  global,  but  the handle
*                                   (the integer value) may vary.
*  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
*                                   ted matrix A, M_A >= 0.
*  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
*                                   buted matrix A, N_A >= 0.
*  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
*                                   block of the matrix A, IMB_A > 0.
*  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
*                                   left   block   of   the  matrix   A,
*                                   INB_A > 0.
*  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
*                                   bute the last  M_A-IMB_A  rows of A,
*                                   MB_A > 0.
*  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
*                                   bute the last  N_A-INB_A  columns of
*                                   A, NB_A > 0.
*  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
*                                   row of the matrix  A is distributed,
*                                   NPROW > RSRC_A >= 0.
*  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
*                                   first column of  A  is  distributed.
*                                   NPCOL > CSRC_A >= 0.
*  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
*                                   array  storing  the  local blocks of
*                                   the distributed matrix A,
*                                   IF( Lc( 1, N_A ) > 0 )
*                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
*                                   ELSE
*                                      LLD_A >= 1. 
*/
{
    MKL_INT myrow, mycol;
    MKL_INT iarow = 0, iacol = 0;
    MKL_INT ictxt = desca[CTXT_-1];
    MKL_INT m     = desca[M_-1];
    MKL_INT n     = desca[N_-1];
    MKL_INT mb    = desca[MB_-1];
    MKL_INT nb    = desca[NB_-1];
    MKL_INT lld   = desca[LLD_-1];
    MKL_INT i, j, ii, jj, il = 0, jl = 0;

    blacs_gridinfo(&ictxt, &nprow, &npcol, &myrow, &mycol);
    for (j = 0; j < n; j += nb) {
       for (i = 0; i < m; i += mb) {
          i++; j++;
          infog2l(&i, &j, desca, &nprow, &npcol, &myrow, &mycol, &il, &jl, &iarow, &iacol);
          i--; j--; il--; jl--;
          if (iarow==myrow && iacol==mycol) {
             for (jj = 0; jj <  min(nb, n - j); jj++) {
                for (ii = 0; ii < min(mb, m - i); ii++) {
                    if(type) {
                       a[il+ii+(jl+jj)*lld] = (double) dlarnd( &dist, seed );
                    } else {
                        if(i==j) a[il+ii+(jl+jj)*lld] = RONE;
                        else a[il+ii+(jl+jj)*lld] = RZERO;
                    }
                }
             }
          }
       }
    }
}

void psmatgen_random(MKL_INT type, MKL_INT* desca, float* a, MKL_INT*  seed,  MKL_INT nprow, MKL_INT npcol)
{
/* 
*  Purpose
*  =======
*
*  psmatgen_random: parallel real float precision matrix generator.
*  Generate a sub-matrix of A.
*
*  Arguments
*  =========
*
* type    (global input)
*          Type of global matrix to be generated.
*          If type==0, local matrices are parts of the matrix 
*          which possesses 1 as all elements along its leading (main) diagonal;
*          while all other elements in the matrix are 0. 
*          A random matrix is generated if type != 0.
*
*  desca   (global and local input) an array of (at least) length 9.
*          The array descriptor for the distributed matrix A.
*          See the notes below for the decription of each element.
*
*  a       (local output), pointer into the local
*          memory to a float array containing the
*          local pieces of the distributed matrix.
*
*  seed    (local input and output) integer array, dimension (4)
*          On entry, the seed of the random number generator.
*          On exit, the seed is updated.
*
*  nprow   (global input)
*          The number of process rows in the grid.
*
*  npcol   (global input)
*          The number of process columns in the grid.
*
*
*  Notes:
*  =====
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
* 
* 
*  In  the  following  comments,   the character _  should  be  read  as
*  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
*  block cyclicly distributed matrix.  Its description vector is DESC_A:
*
*  NOTATION         STORED IN       EXPLANATION
*  ---------------- --------------- ------------------------------------
*  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
*  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
*                                   the NPROW x NPCOL BLACS process grid
*                                   A  is  distributed over. The context
*                                   itself  is  global,  but  the handle
*                                   (the integer value) may vary.
*  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
*                                   ted matrix A, M_A >= 0.
*  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
*                                   buted matrix A, N_A >= 0.
*  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
*                                   block of the matrix A, IMB_A > 0.
*  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
*                                   left   block   of   the  matrix   A,
*                                   INB_A > 0.
*  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
*                                   bute the last  M_A-IMB_A  rows of A,
*                                   MB_A > 0.
*  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
*                                   bute the last  N_A-INB_A  columns of
*                                   A, NB_A > 0.
*  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
*                                   row of the matrix  A is distributed,
*                                   NPROW > RSRC_A >= 0.
*  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
*                                   first column of  A  is  distributed.
*                                   NPCOL > CSRC_A >= 0.
*  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
*                                   array  storing  the  local blocks of
*                                   the distributed matrix A,
*                                   IF( Lc( 1, N_A ) > 0 )
*                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
*                                   ELSE
*                                      LLD_A >= 1.
*/

    MKL_INT myrow, mycol;
    MKL_INT iarow = 0, iacol = 0;
    MKL_INT ictxt = desca[CTXT_-1];
    MKL_INT m     = desca[M_-1];
    MKL_INT n     = desca[N_-1];
    MKL_INT mb    = desca[MB_-1];
    MKL_INT nb    = desca[NB_-1];
    MKL_INT lld   = desca[LLD_-1];
    MKL_INT i, j, ii, jj, il = 0, jl = 0;

    blacs_gridinfo(&ictxt, &nprow, &npcol, &myrow, &mycol);
    for (j = 0; j < n; j += nb) {
       for (i = 0; i < m; i += mb) {
          i++; j++;
          infog2l(&i, &j, desca, &nprow, &npcol, &myrow, &mycol, &il, &jl, &iarow, &iacol);
          i--; j--; il--; jl--;
          if ( iarow==myrow && iacol==mycol ) {
             for (jj = 0; jj <  min(nb, n - j); jj++) {
                for (ii = 0; ii < min(mb, m - i); ii++) {
                    if( type ) {
                       a[il+ii+(jl+jj)*lld] = (float) dlarnd( &dist, seed );
                    } else {
                        if(i==j) a[il+ii+(jl+jj)*lld] = RONE;
                        else a[il+ii+(jl+jj)*lld] = RZERO;
                    }
                }
             }
          }
       }
    }
}

void pzmatgen_random(MKL_INT type, MKL_INT* desca, MKL_Complex16* a, MKL_INT*  seed,  MKL_INT nprow, MKL_INT npcol)
{
/*
*  Purpose
*  =======
*
*  pzmatgen_random: parallel double complex precision matrix generator.
*  Generate a sub-matrix of A.
*
*  Arguments
*  =========
*
*  type    (global input)
*          Type of global matrix to be generated.
*          If type==0, local matrices are parts of the matrix 
*          which possesses 1 as all elements along its leading (main) diagonal;
*          while all other elements in the matrix are 0. 
*          Otherwise a random matrix is generated.

*  type    (global input)
*          Type of global matrix to be generated.
*          If type==0, local matrices are parts of the identity matrix. 
*          Otherwise a random matrix is generated.
*
*  desca   (global and local input) an array of (at least) length 9.
*          The array descriptor for the distributed matrix A.
*          See the notes below for the decription of each element.
*
*  a       (local output), pointer into the local
*          memory to a MKL_Complex16 array containing the
*          local pieces of the distributed matrix.
*
*  seed    (local input and output) integer array, dimension (4)
*          On entry, the seed of the random number generator.
*          On exit, the seed is updated.
* 
*  nprow   (global input)
*          The number of process rows in the grid.
*
*  npcol   (global input)
*          The number of process columns in the grid.
*
*  Notes:
*  =====
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*  In  the  following  comments,   the character _  should  be  read  as
*  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
*  block cyclicly distributed matrix.  Its description vector is DESC_A:
*
*  NOTATION         STORED IN       EXPLANATION
*  ---------------- --------------- ------------------------------------
*  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
*  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
*                                   the NPROW x NPCOL BLACS process grid
*                                   A  is  distributed over. The context
*                                   itself  is  global,  but  the handle
*                                   (the integer value) may vary.
*  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
*                                   ted matrix A, M_A >= 0.
*  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
*                                   buted matrix A, N_A >= 0.
*  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
*                                   block of the matrix A, IMB_A > 0.
*  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
*                                   left   block   of   the  matrix   A,
*                                   INB_A > 0.
*  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
*                                   bute the last  M_A-IMB_A  rows of A,
*                                   MB_A > 0.
*  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
*                                   bute the last  N_A-INB_A  columns of
*                                   A, NB_A > 0.
*  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
*                                   row of the matrix  A is distributed,
*                                   NPROW > RSRC_A >= 0.
*  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
*                                   first column of  A  is  distributed.
*                                   NPCOL > CSRC_A >= 0.
*  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
*                                   array  storing  the  local blocks of
*                                   the distributed matrix A,
*                                   IF( Lc( 1, N_A ) > 0 )
*                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
*                                   ELSE
*                                      LLD_A >= 1.
*/

    MKL_INT myrow, mycol;
    MKL_INT iarow = 0, iacol = 0;
    MKL_INT ictxt = desca[CTXT_-1];
    MKL_INT m     = desca[M_-1];
    MKL_INT n     = desca[N_-1];
    MKL_INT mb    = desca[MB_-1];
    MKL_INT nb    = desca[NB_-1];
    MKL_INT lld   = desca[LLD_-1];
    MKL_INT i, j, ii, jj, il = 0, jl = 0;

    blacs_gridinfo(&ictxt, &nprow, &npcol, &myrow, &mycol);
    for (j = 0; j < n; j += nb) {
       for (i = 0; i < m; i += mb) {
          i++; j++;
          infog2l(&i, &j, desca, &nprow, &npcol, &myrow, &mycol, &il, &jl, &iarow, &iacol);
          i--; j--; il--; jl--;
          if (iarow==myrow && iacol==mycol) {
             for (jj = 0; jj <  min(nb, n - j); jj++) {
                for (ii = 0; ii < min(mb, m - i); ii++) {
                    if( type ) {
                       a[il+ii+(jl+jj)*lld].real = (double) dlarnd( &dist, seed );
                       a[il+ii+(jl+jj)*lld].imag = (double) dlarnd( &dist, seed );
                    } else {
                        if( i==j ) {
                            a[il+ii+(jl+jj)*lld].real = RONE;
                            a[il+ii+(jl+jj)*lld].imag = RZERO;
                        }  else {
                            a[il+ii+(jl+jj)*lld].real = RZERO;
                            a[il+ii+(jl+jj)*lld].imag = RZERO;
                        }
                    }
                }
             }
          }
       }
    }
}
 
void pcmatgen_random(MKL_INT type, MKL_INT* desca, MKL_Complex8* a, MKL_INT*  seed,  MKL_INT nprow, MKL_INT npcol)
{
/*
*  Purpose
*  =======
*
*  pcmatgen_random: parallel single complex precision matrix generator.
*  Generate a sub-matrix of A.
*
*  Arguments
*  =========
*
*  type    (global input)
*          Type of global matrix to be generated.
*          If type==0, local matrices are parts of the matrix 
*          which possesses 1 as all elements along its leading (main) diagonal;
*          while all other elements in the matrix are 0. 
*          Otherwise a random matrix is generated.
*
*  desca   (global and local input) an array of (at least) length 9.
*          The array descriptor for the distributed matrix A.
*          See the notes below for the decription of each element.
*
*  a       (local output), pointer into the local
*          memory to a MKL_Complex8 array containing the
*          local pieces of the distributed matrix.
*
*  seed    (local input and output) integer array, dimension (4)
*          On entry, the seed of the random number generator.
*          On exit, the seed is updated.
*
*  nprow   (global input)
*          The number of process rows in the grid.
*
*  npcol   (global input)
*          The number of process columns in the grid.
*
*
*  Notes:
*  =====
*
*  Each global data object is described by an associated description
*  vector.  This vector stores the information required to establish
*  the mapping between an object element and its corresponding process
*  and memory location.
*
*
*  In  the  following  comments,   the character _  should  be  read  as
*  "of  the  distributed  matrix".  Let  A  be a generic term for any 2D
*  block cyclicly distributed matrix.  Its description vector is DESC_A:
*
*  NOTATION         STORED IN       EXPLANATION
*  ---------------- --------------- ------------------------------------
*  DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
*  CTXT_A  (global) DESCA[ CTXT_  ] The BLACS context handle, indicating
*                                   the NPROW x NPCOL BLACS process grid
*                                   A  is  distributed over. The context
*                                   itself  is  global,  but  the handle
*                                   (the integer value) may vary.
*  M_A     (global) DESCA[ M_     ] The  number of rows in the distribu-
*                                   ted matrix A, M_A >= 0.
*  N_A     (global) DESCA[ N_     ] The number of columns in the distri-
*                                   buted matrix A, N_A >= 0.
*  IMB_A   (global) DESCA[ IMB_   ] The number of rows of the upper left
*                                   block of the matrix A, IMB_A > 0.
*  INB_A   (global) DESCA[ INB_   ] The  number  of columns of the upper
*                                   left   block   of   the  matrix   A,
*                                   INB_A > 0.
*  MB_A    (global) DESCA[ MB_    ] The blocking factor used to  distri-
*                                   bute the last  M_A-IMB_A  rows of A,
*                                   MB_A > 0.
*  NB_A    (global) DESCA[ NB_    ] The blocking factor used to  distri-
*                                   bute the last  N_A-INB_A  columns of
*                                   A, NB_A > 0.
*  RSRC_A  (global) DESCA[ RSRC_  ] The process row over which the first
*                                   row of the matrix  A is distributed,
*                                   NPROW > RSRC_A >= 0.
*  CSRC_A  (global) DESCA[ CSRC_  ] The  process column  over  which the
*                                   first column of  A  is  distributed.
*                                   NPCOL > CSRC_A >= 0.
*  LLD_A   (local)  DESCA[ LLD_   ] The  leading dimension  of the local
*                                   array  storing  the  local blocks of
*                                   the distributed matrix A,
*                                   IF( Lc( 1, N_A ) > 0 )
*                                      LLD_A >= MAX( 1, Lr( 1, M_A ) )
*                                   ELSE
*                                      LLD_A >= 1.
*/
    MKL_INT myrow, mycol;
    MKL_INT iarow = 0, iacol = 0;
    MKL_INT ictxt = desca[CTXT_-1];
    MKL_INT m     = desca[M_-1];
    MKL_INT n     = desca[N_-1];
    MKL_INT mb    = desca[MB_-1];
    MKL_INT nb    = desca[NB_-1];
    MKL_INT lld   = desca[LLD_-1];
    MKL_INT i, j, ii, jj, il = 0, jl = 0;

    blacs_gridinfo(&ictxt, &nprow, &npcol, &myrow, &mycol);
    for (j = 0; j < n; j += nb) {
       for (i = 0; i < m; i += mb) {
          i++; j++;
          infog2l(&i, &j, desca, &nprow, &npcol, &myrow, &mycol, &il, &jl, &iarow, &iacol);
          i--; j--; il--; jl--;
          if (iarow==myrow && iacol==mycol) {
             for (jj = 0; jj <  min(nb, n - j); jj++) {
                for (ii = 0; ii < min(mb, m - i); ii++) {
                    if( type ) {
                       a[il+ii+(jl+jj)*lld].real = (float) dlarnd( &dist, seed );
                       a[il+ii+(jl+jj)*lld].imag = (float) dlarnd( &dist, seed );
                    } else {
                        if( i==j ) {
                            a[il+ii+(jl+jj)*lld].real = RONE;
                            a[il+ii+(jl+jj)*lld].imag = RZERO;
                        }  else {
                            a[il+ii+(jl+jj)*lld].real = RZERO;
                            a[il+ii+(jl+jj)*lld].imag = RZERO;
                        }
                    }
                }
             }
          }
       }
    }
}

psgetrf_example.c

/*******************************************************************************
* Copyright 2022 Intel Corporation.
*
* This software and the related documents are Intel copyrighted  materials,  and
* your use of  them is  governed by the  express license  under which  they were
* provided to you (License).  Unless the License provides otherwise, you may not
* use, modify, copy, publish, distribute,  disclose or transmit this software or
* the related documents without Intel's prior written permission.
*
* This software and the related documents  are provided as  is,  with no express
* or implied  warranties,  other  than those  that are  expressly stated  in the
* License.
*******************************************************************************
!  Content:
!      Intel(R) oneAPI Math Kernel Library ScaLAPACK C example source file
!
!*****************************************************************************
!===============================================================================
!==== ScaLAPACK psgetrf and psgetrs example program ============================
!===============================================================================
!
! Example solves a system of distributed linear equations
!
!       A * X = Y  or  A * X =  I
!
!  with a general random n-by-n distributed matrix A using the LU
!  factorization computed by psgetrf.
!  Y is a random n-by-nrhs distributed matrix, 
!  I denotes n-by-nrhs matrix which possesses 1 as all 
!  elements along its leading (main) diagonal; 
!  while all other elements in the matrix are 0. 
!  If n==nrhs, I is the identity matrix and X is the matrix inverse. 
!
! Example also demonstrates BLACS routines usage.
!
! It works with block-cyclic distribution on 2D process grid
!
! List of routines demonstrated in the example:
!
! psgetrf
! psgetrs 
! pslamch 
! pslange
! psgemr2d
! psgemm
! blacs_get
! blacs_gridinit
! blacs_gridinfo
! blacs_gridexit
! blacs_exit 
! descinit
! igamx2d 
! igebr2d
! igebs2d  
! sgebr2d
! sgebs2d
! numroc
! indxg2p
!
!  The program must be driven by a short data file.
!
! 0               type, problem type, if type=0 the system A*X = I is solved, otherwise A*X = Y
! 2000            n, dimension of matrix, 0 < n
! 1000            nrhs,number of right hand sides
! 16              nb, size of blocks, must be > 0
! 2               p, number of rows in the process grid, must be > 0
! 2               q, number of columns in the process grid, must be > 0, p*q = number of processes
! 20.0            threshold for residual check
!
!*******************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include "matrix_generator.h"

MKL_INT test_solve(MKL_INT ictxt, MKL_INT type, MKL_INT n, MKL_INT nrhs, MKL_INT nb, MKL_INT nprow,  MKL_INT npcol, MKL_INT* seed, float thresh)
{
    MKL_INT info = 0;
    MKL_INT err = 0;

    float  *A = NULL;
    float  *A_INI = NULL;
    float  *X = NULL;
    float  *Y = NULL;
    MKL_INT   *IPIV = NULL;
/*  Matrix descriptors */
    MKL_INT  desca[9];
    MKL_INT  descx[9];
    MKL_INT  descy[9];
/*  Local variables */

    MKL_INT myrow;
    MKL_INT mycol;
    MKL_INT iam, nprocs;
    float eps;
    float normA, normRhs, normSol, normRes, residual;

    float fzero = RZERO;
    float fone = RONE;
    float fnegone = RNEGONE; 
/*  Compute machine epsilon */
    eps = pslamch( &ictxt, "e" );
/*  Get information about how many processes are used for program execution
    and number of current process */
    blacs_pinfo( &iam, &nprocs );      
    blacs_gridinfo(&ictxt, &nprow, &npcol, &myrow, &mycol);
/*  Compute precise length of local pieces of distributed matrices */ 
    MKL_INT loc_ra = numroc(&n, &nb, &myrow, &izero, &nprow);
    MKL_INT loc_ca = numroc(&n, &nb, &mycol, &izero, &npcol);
    MKL_INT loc_lda = max(1, loc_ra);
    MKL_INT loc_sizeA =loc_ra*loc_ca;
   
    MKL_INT loc_rx = numroc(&n, &nb, &myrow, &izero, &nprow);
    MKL_INT loc_cx = numroc(&nrhs, &nb, &mycol, &izero, &npcol);
    MKL_INT loc_ldx = max(1, loc_rx);
    MKL_INT loc_ry = numroc(&n, &nb, &myrow, &izero, &nprow); 
    MKL_INT loc_cy = numroc(&nrhs, &nb, &mycol, &izero, &npcol);
    MKL_INT loc_ldy = max(1, loc_ry);
/*  Initialize descriptors for distributed arrays */  

    descinit(desca, &n,  &n,   &nb, &nb, &izero, &izero, &ictxt, &loc_lda, &info);
    descinit(descx, &n, &nrhs, &nb, &nb, &izero, &izero, &ictxt, &loc_ldx, &info);
    descinit(descy, &n, &nrhs, &nb, &nb, &izero, &izero, &ictxt, &loc_ldy, &info);

/*  Allocate matrix, right-hand-side, vector solution x and the IPIV array
    containing the pivoting information. The array A_INI is used to store
    original matrix */
    if( loc_sizeA ) {
        A = (float*)mkl_malloc( loc_sizeA * sizeof(float), 64 );
        A_INI=(float*)mkl_malloc( loc_sizeA * sizeof(float), 64 );
    }
    if( loc_rx * loc_cx ) X = (float*)mkl_malloc( loc_rx * loc_cx * sizeof(float), 64 );
    if( loc_ry * loc_cy ) Y = (float*)mkl_malloc( loc_ry * loc_cy * sizeof(float), 64 );
    IPIV = (MKL_INT*)mkl_malloc( (loc_ra + nb)*sizeof(MKL_INT), 64 );
    if( A == NULL || X == NULL || Y == NULL || A_INI == NULL || IPIV == NULL ) info = 1;
    igamx2d(&ictxt, "ALL", " ", &ione, &ione, &info, &ione, NULL, NULL, &inegone, &inegone, &inegone);
    if( info ) {
        err = 1;
        if ( iam == 0 ) printf( "\n Can't allocate memory for arrays\n" );
        goto early_exit;
    }
/*  Generate matrix and solution. Copy the initial matrix. */
    seed[0]+= myrow + mycol;
    psmatgen_random(1, desca, A, seed, nprow, npcol);
    scopy(&loc_sizeA, A, &ione, A_INI, &ione);
    psmatgen_random(type, descx, X, seed, nprow, npcol);
    psmatgen_random(type, descy, Y, seed, nprow, npcol);
/*  Compute the right hand side */
    if( type ) {
        psgemm("N", "N", &n, &nrhs, &n, &fone, A, &ione, &ione, desca,
                X, &ione, &ione, descx,
                &fzero, Y, &ione,  &ione,  descy);
/*  Copy the right hand side Y to X */
        psgemr2d(&n, &nrhs, Y, &ione, &ione, descy, X, &ione, &ione, descx, &ictxt);
    }

/*  Compute norm of the initial matrix and right hand side  */
    normA = pslange("M", &n, &n, A, &ione, &ione, desca, NULL);
    normRhs = pslange("M", &n, &nrhs, Y, &ione, &ione, descy, NULL);

/*  Compute the LU factorization of the initial matrix */
    psgetrf(&n, &n, A, &ione, &ione, desca, IPIV, &info);
    igamx2d(&ictxt, "ALL", " ", &ione, &ione, &info, &ione, NULL, NULL, &inegone, &inegone, &inegone);
/* If psgetrf fails, deallocate all arays and report an error */
    if( info) {
        err = 1;
        if ( iam == 0 ) printf( "\n psgetrf fails to compute LU decomposition \n" );
        goto early_exit;
    }

    psgetrs("N", &n, &nrhs, A, &ione, &ione, desca, IPIV, X, &ione, &ione, descx, &info);
    igamx2d(&ictxt, "ALL", " ", &ione, &ione, &info, &ione, NULL, NULL, &inegone, &inegone, &inegone);
/* If psgetrs fails, deallocate all arays and report an error */
    if( info ) {
        err = 1;
        if ( iam == 0 ) printf( "\n psgetrs fails to solve \n" );
        goto early_exit;
    }
/* Normsol is the norm of the solution matrix:  normsol = || X|| */
    normSol = pslange("M", &n, &nrhs, X, &ione, &ione, descx, NULL);


/* Compute the residual matrix || A*X - Y ||  */
    psgemm("N", "N", &n, &nrhs, &n, &fone, A_INI, &ione, &ione, desca,
            X, &ione, &ione, descx,
            &fnegone, Y, &ione,  &ione,  descy);
/* Compute the residual as: 
   residual = ||A*X-Y||/(n*eps*(||A||*||X||+||Y||)) */

    normRes = pslange("M", &n, &nrhs, Y, &ione, &ione, descy, NULL);
    residual = normRes/( ( (float)n ) * eps *( normA * normSol + normRhs ) );

/*  Check if residual passed or failed the threshold */
    if( iam == 0 ) {
        if ( ( thresh >= RZERO ) && !( residual <= thresh ) ){
            printf( "FAILED. Residual = %05.11f\n", residual );
            err = 1;
        } else {
            printf( "PASSED. Residual = %05.11f\n", residual );
        }
        printf( "=== END OF EXAMPLE =====================\n" ); 
    }
early_exit :
/*  Destroy arrays */
    if( A ) mkl_free(A);
    if( A_INI ) mkl_free(A_INI);
    if( X ) mkl_free(X);
    if( Y ) mkl_free(Y);
    if( IPIV ) mkl_free(IPIV); 
    return err;
}

int main(int argc, char *argv[])
{
/*  ==== Declarations =================================================== */

/*  File variables */
    FILE    *fin;

/*  Local scalars */
    MKL_INT iam, nprocs, ictxt=-1, nprow, npcol;
    MKL_INT n, nb, nrhs;
    MKL_INT err;
    MKL_INT type=1;
    int     n_int, nb_int, nprow_int, npcol_int, nrhs_int, type_int;
    float  thresh;
    MKL_INT seed[4] = { 105, 1410, 1860, 4085 };

/*  Local arrays */
    MKL_INT iw[ 6 ];

/*  ==== Executable statements ========================================== */

/*  Get information about how many processes are used for program execution
    and number of current process */
    blacs_pinfo( &iam, &nprocs );

/*  Init temporary 1D process grid */
    blacs_get( &inegone, &izero, &ictxt );
    blacs_gridinit( &ictxt, "C", &nprocs, &ione );

/*  Open input file */
    if ( iam == 0 ) {
        fin = fopen( "psgetrf_example.in", "r" );
        if ( fin == NULL ) {
            printf( "Error while open input file." );
            return 1;
        }
    }

/*  Read data and send it to all processes */
    if ( iam == 0 ) {

/*      Read parameters */
        fscanf( fin, "%d type, problem type, if type=0 the system A*X = I is solved, otherwise A*X = Y", &type_int );
        fscanf( fin, "%d n, dimension of matrix, 0 < n", &n_int );
        fscanf( fin, "%d nrhs,number of right hand sides, 0 < nrhs", &nrhs_int );
        fscanf( fin, "%d nb, size of blocks, must be > 0", &nb_int );
        fscanf( fin, "%d p, number of rows in the process grid, must be > 0", &nprow_int );
        fscanf( fin, "%d q, number of columns in the process grid, must be > 0, p*q = number of processes", &npcol_int );
        fscanf( fin, "%f threshold for residual check", &thresh );
        fclose( fin );
        type = (MKL_INT) type_int;
        n = (MKL_INT) n_int;
        nrhs = (MKL_INT) nrhs_int;
        nb = (MKL_INT) nb_int;
        nprow = (MKL_INT) nprow_int;
        npcol = (MKL_INT) npcol_int;
/*      Check if all parameters are correct */
        if( ( n<=0 )||( nrhs<=0 )||( nb<=0 )||( nprow<=0 )||
            ( npcol<=0 )||( nprow*npcol != nprocs ) ) {
            printf( "One or several input parameters has incorrect value. Limitations:\n" );
            printf( "n > 0, nrhs > 0, nb > 0, p > 0, q > 0 - integer\n" );
            printf( "p*q = number of processes\n" );
            printf( "threshold - float \n");
            return 1;
        }

/*      Pack data into array and send it to other processes */
        iw[ 0 ] = type;
        iw[ 1 ] = n;
        iw[ 2 ] = nrhs;
        iw[ 3 ] = nb;
        iw[ 4 ] = nprow;
        iw[ 5 ] = npcol;
        igebs2d( &ictxt, "All", " ", &isix, &ione, iw, &isix );
        sgebs2d( &ictxt, "All", " ", &ione, &ione, &thresh, &ione );
    } else {

/*      Recieve and unpack data */
        igebr2d( &ictxt, "All", " ", &isix, &ione, iw, &isix, &izero, &izero );
        sgebr2d( &ictxt, "All", " ", &ione, &ione, &thresh, &ione, &izero, &izero );
        type = iw[ 0 ];
        n = iw[ 1 ];
        nrhs = iw[ 2 ];
        nb = iw[ 3 ];
        nprow = iw[ 4 ];
        npcol = iw[ 5 ];
    }
/*  Destroy temporary process grid */
    blacs_gridexit( &ictxt ); 
/*  Init workind 2D process grid */ 
    blacs_get(&inegone, &izero, &ictxt);
    blacs_gridinit(&ictxt, "R", &nprow, &npcol);
    if ( iam == 0 ) {
/*      Print information of task */
        printf( "=== START OF EXAMPLE ===================\n" );
        if( type ) {
            printf( "Solve matrix equation: A*X = Y\n" );
            printf( "with a general random n-by-n distributed matrix A \n" );
            printf( "Y is a random n-by-nrhs distributed matrix\n\n" );
        } else {
            printf( "Solve matrix equation: A*X = I\n" );
            printf( "with a general random n-by-n distributed matrix A \n" );
            printf( "I denotes a n-by-nrhs matrix which possesses 1 \n" ); 
            printf( "as all elements along its leading (main) diagonal \n" );
            printf( "while all other elements in the matrix are 0. \n" ); 
            printf( "If n==nrhs, I is the identity matrix and X is the matrix inverse of A.\n\n" );
        }
        printf( "n = %d, nrhs = %d, nb = %d; %dx%d - process grid\n\n", n_int, nrhs_int, nb_int, nprow_int, npcol_int);
        printf( "Threshold for residual check = %05.11f\n", thresh );
    }
    err = test_solve(ictxt, type, n, nrhs, nb,  nprow, npcol, seed, thresh);
/*  Destroy process grid */
    blacs_gridexit(&ictxt);
    blacs_exit(&izero);
    return err;
}

psgetrf_example.in

0		type, problem type, if type=0 the system A*X = I is solved, otherwise A*X = Y
2000		n, dimension of matrix, 0 < n
1000		nrhs,number of right hand sides
16		nb, size of blocks, must be > 0
2		p, number of rows in the process grid, must be > 0
2		q, number of columns in the process grid, must be > 0, p*q = number of processes
20.0		threshold for residual check

CMakeLists.txt

#===============================================================================
# Copyright 2023 Intel Corporation.
#
# This software and the related documents are Intel copyrighted  materials,  and
# your use of  them is  governed by the  express license  under which  they were
# provided to you (License).  Unless the License provides otherwise, you may not
# use, modify, copy, publish, distribute,  disclose or transmit this software or
# the related documents without Intel's prior written permission.
#
# This software and the related documents  are provided as  is,  with no express
# or implied  warranties,  other  than those  that are  expressly stated  in the
# License.
#===============================================================================

cmake_minimum_required(VERSION 3.13)
enable_testing()

# Enable SCALAPACK in oneMKL
# See MKLConfig.cmake top section for all cluster libraries
set(ENABLE_SCALAPACK ON)

# Define language and compiler
set(TEST_LANG C)
set(TEST_EXT c)

project(MKL_Example LANGUAGES ${TEST_LANG})
# Include oneMKL config file
find_package(MKL CONFIG REQUIRED)

# W/A for known problem in Intel(R) MPI 2021.1
if(MKL_MPI STREQUAL "intelmpi" AND DEFINED ENV{I_MPI_ROOT})
  if(UNIX AND NOT APPLE AND $ENV{I_MPI_ROOT} MATCHES "2021.1")
    set(MPI_C_ADDITIONAL_INCLUDE_DIRS $ENV{I_MPI_ROOT}/include)
  endif()
endif()

# Force mshpc to be the first in mpi find package
if(MKL_MPI STREQUAL "msmpi" OR MKL_MPI STREQUAL "mshpc")
  set(MPI_GUESS_LIBRARY_NAME MSMPI)
endif()

find_package(MPI REQUIRED)

# Override default compile/link lines for mpi script
if(USE_MPI_SCRIPT AND WIN32)
  set(CMAKE_C_COMPILE_OBJECT "<CMAKE_C_COMPILER> <DEFINES> <INCLUDES> <FLAGS> /Fo<OBJECT> -c <SOURCE>")
  set(CMAKE_C_LINK_EXECUTABLE "<CMAKE_C_COMPILER> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
endif()

# Try to identify MPI for correct BLACS if MKL_MPI is not defined
if(MPI_${TEST_LANG}_FOUND AND NOT MKL_MPI)
  if(UNIX AND NOT APPLE AND MPI_${TEST_LANG}_COMPILER MATCHES "openmpi")
    set(CURRENT_MPI "openmpi")
  elseif(WIN32 AND MPI_${TEST_LANG}_COMPILER MATCHES "msmpi" OR MPI_${TEST_LANG}_COMPILER MATCHES "Microsoft MPI")
    set(CURRENT_MPI "msmpi")
  elseif(MPI_${TEST_LANG}_COMPILER MATCHES "mpich")
    set(CURRENT_MPI "mpich")
  elseif(NOT APPLE AND MPI_${TEST_LANG}_COMPILER MATCHES ".*intel.*mpi.*")
    set(CURRENT_MPI "intelmpi")
    if(WIN32 AND MPI_LOCALONLY)
      set(MPI_LOCAL_OPT -localonly)
    endif()
  endif()
  set(MKL_MPI "${CURRENT_MPI}")
endif()

# Define target
set(TEST_INCLUDE "${MPI_${TEST_LANG}_INCLUDE_DIRS}")

# Build example
set(func_name "psgetrf_example")
set(func "${func_name}")

file(GLOB_RECURSE ${func}_SRC ${PROJECT_SOURCE_DIR}/${func_name}.${TEST_EXT})
add_executable(${func} ${${func}_SRC})
target_include_directories(${func} PUBLIC ${TEST_INCLUDE})
if(WIN32)
  set(TEST_COPT "-D_CRT_SECURE_NO_WARNINGS")
  target_compile_options(${func} PUBLIC ${TEST_COPT})
endif()
target_link_libraries(${func} PUBLIC MKL::MKL MPI::MPI_C)

# Register example as ctest
configure_file(${PROJECT_SOURCE_DIR}/${func_name}.in ${CMAKE_CURRENT_BINARY_DIR} COPYONLY)
add_test(NAME ${func} COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPI_LOCAL_OPT} ${PROJECT_BINARY_DIR}/${func})

# Add oneMKL environment variables
if(MKL_ENV)
  set_tests_properties(${func} PROPERTIES ENVIRONMENT "${MKL_ENV}")
endif()