Intel® oneAPI Deep Neural Network Developer Guide and Reference

ID 768875
Date 2/28/2024
Public

A newer version of this document is available. Customers should click here to go to the newest version.

Document Table of Contents

Bnorm u8 by binary post-ops example

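This example implements batch normalization on u8 data as a chain of binary operations: binary_sub(src, mean), binary_div(tmp_dst, variance), binary_mul(tmp_dst, scale), binary_add(tmp_dst, shift), followed by a final binary_mul(tmp_dst, output_scale) that is used only for re-quantization.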

The first operation, binary_sub, is the binary primitive itself; the remaining operations are attached to it as binary post-ops, so the whole chain is executed by a single primitive that writes its result in place over the u8 source tensor.

Some key take-aways include:

  • How tensors are implemented and submitted to primitives.

  • How primitives are created.

  • How to use multiple binary post operations.

  • How to use different data types in binary.

/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/


#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

#include "dnnl.hpp"
#include "example_utils.hpp"

using namespace dnnl;

using tag = memory::format_tag;
using dt = memory::data_type;

/// Runs a batch-normalization-style computation on a u8 tensor using one
/// binary primitive with a chain of binary post-ops:
///
///     dst = ((((src - mean) / variance) * scale) + shift) * output_scale
///
/// The source tensor is u8 in nhwc layout; mean, variance, scale, shift and
/// output scale are f32 tensors of shape {1, IC, 1, 1}. The primitive runs in
/// place: the source memory object is also passed as the destination.
///
/// @param engine_kind Kind of engine (e.g. CPU or GPU) to create and run on.
void bnorm_u8_via_binary_postops(dnnl::engine::kind engine_kind) {

    // Create execution dnnl::engine.
    dnnl::engine engine(engine_kind, 0);

    // Create dnnl::stream.
    dnnl::stream engine_stream(engine);

    // Tensor dimensions.
    const memory::dim N = 3, // batch size
            IC = 3, // channels
            IH = 150, // tensor height
            IW = 150; // tensor width

    // Source and per-channel parameter tensor shapes.
    memory::dims src_dims = {N, IC, IH, IW};
    memory::dims params_dims = {1, IC, 1, 1};

    // Allocate buffers. The element type of each host buffer matches the data
    // type of the memory descriptor it is copied to/from: u8 for the source,
    // f32 for every parameter tensor.
    std::vector<uint8_t> src_data(product(src_dims));
    std::vector<float> mean_data(product(params_dims));
    std::vector<float> variance_data(product(params_dims));
    std::vector<float> scale_data(product(params_dims));
    std::vector<float> shift_data(product(params_dims));
    std::vector<float> oscale_data(product(params_dims));

    // Initialize. Counters are locals (not function-level statics) so every
    // call of this function produces the same input data.
    int src_idx = 0;
    std::generate(src_data.begin(), src_data.end(), [&src_idx]() {
        // Map cos() from [-1, 1] onto the full u8 range [0, 255].
        const float value = (std::cos(src_idx++ / 10.f) + 1.f) * 127.5f;
        return static_cast<uint8_t>(std::lround(value));
    });
    int mean_idx = 0;
    std::generate(mean_data.begin(), mean_data.end(),
            [&mean_idx]() { return std::sin(mean_idx++ * 2.f); });
    int variance_idx = 0;
    std::generate(
            variance_data.begin(), variance_data.end(), [&variance_idx]() {
                const float value = std::abs(std::sin(variance_idx++ * 4.f));
                // Avoid division by zero. Variance should be positive.
                return value == 0.f ? 1.f : value;
            });
    int scale_idx = 0;
    std::generate(scale_data.begin(), scale_data.end(),
            [&scale_idx]() { return std::sin(scale_idx++ * 6.f); });
    int shift_idx = 0;
    std::generate(shift_data.begin(), shift_data.end(),
            [&shift_idx]() { return std::sin(shift_idx++ * 8.f); });
    std::fill(oscale_data.begin(), oscale_data.end(), 0.5f);

    // Create descriptors.
    auto src_md = memory::desc(src_dims, dt::u8, tag::nhwc);
    auto mean_md = memory::desc(params_dims, dt::f32, tag::nhwc);
    auto variance_md = memory::desc(params_dims, dt::f32, tag::nhwc);
    auto scale_md = memory::desc(params_dims, dt::f32, tag::nhwc);
    auto shift_md = memory::desc(params_dims, dt::f32, tag::nhwc);
    auto oscale_md = memory::desc(params_dims, dt::f32, tag::nhwc);

    // Create memory objects.
    auto src_mem = memory(src_md, engine);
    auto mean_mem = memory(mean_md, engine);
    auto variance_mem = memory(variance_md, engine);
    auto scale_mem = memory(scale_md, engine);
    auto shift_mem = memory(shift_md, engine);
    auto oscale_mem = memory(oscale_md, engine);

    // Write data to memory object's handle.
    write_to_dnnl_memory(src_data.data(), src_mem);
    write_to_dnnl_memory(mean_data.data(), mean_mem);
    write_to_dnnl_memory(variance_data.data(), variance_mem);
    write_to_dnnl_memory(scale_data.data(), scale_mem);
    write_to_dnnl_memory(shift_data.data(), shift_mem);
    write_to_dnnl_memory(oscale_data.data(), oscale_mem);

    // Bnorm operation with scale and shift. Post-ops are applied in the order
    // they are appended; the index used for the execution arguments below must
    // follow the same order.
    post_ops binary_ops;
    // dst_tmp = dst_tmp / variance
    binary_ops.append_binary(algorithm::binary_div, variance_md);
    // dst_tmp = dst_tmp * scale
    binary_ops.append_binary(algorithm::binary_mul, scale_md);
    // dst_tmp = dst_tmp + shift
    binary_ops.append_binary(algorithm::binary_add, shift_md);
    // dst = dst_tmp * output_scale (only for re-quantization)
    binary_ops.append_binary(algorithm::binary_mul, oscale_md);
    primitive_attr binary_attr;
    binary_attr.set_post_ops(binary_ops);

    // Create primitive descriptor.
    // dst_tmp = src - mean
    // The destination descriptor is src_md, so the result is u8 as well.
    auto binary_pd = binary::primitive_desc(engine, algorithm::binary_sub,
            src_md, mean_md, src_md, binary_attr);

    // Create the primitive.
    auto binary_prim = binary(binary_pd);

    // Primitive arguments.
    std::unordered_map<int, memory> binary_args;
    binary_args.insert({DNNL_ARG_SRC_0, src_mem});
    binary_args.insert({DNNL_ARG_SRC_1, mean_mem});
    // In-place mode (dst is src)
    binary_args.insert({DNNL_ARG_DST, src_mem});
    // Second operand of each post-op, keyed by the post-op's position.
    binary_args.insert(
            {DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, variance_mem});
    binary_args.insert(
            {DNNL_ARG_ATTR_MULTIPLE_POST_OP(1) | DNNL_ARG_SRC_1, scale_mem});
    binary_args.insert(
            {DNNL_ARG_ATTR_MULTIPLE_POST_OP(2) | DNNL_ARG_SRC_1, shift_mem});
    binary_args.insert(
            {DNNL_ARG_ATTR_MULTIPLE_POST_OP(3) | DNNL_ARG_SRC_1, oscale_mem});

    // Primitive execution
    binary_prim.execute(engine_stream, binary_args);

    // Wait for the computation to finalize.
    engine_stream.wait();

    // Read the in-place u8 result back into the host source buffer.
    read_from_dnnl_memory(src_data.data(), src_mem);
}

int main(int argc, char **argv) {
    return handle_example_errors(
            bnorm_u8_via_binary_postops, parse_engine_kind(argc, argv));
}