CUDA Learning (47)

Introduction:

The examples below use the warp shuffle functions (__shfl_sync, __shfl_up_sync, and __shfl_xor_sync), which exchange a variable between threads within a warp directly, without going through shared memory.

Examples:
Broadcast of a single value across a warp:

#include <stdio.h>


__global__ void bcast(int arg) {
    int laneId = threadIdx.x & 0x1f;
    int value;
    if (laneId == 0) // Note unused variable for
        value = arg; // all threads except lane 0
    value = __shfl_sync(0xffffffff, value, 0); // Synchronize all threads in warp, and get "value" from lane 0
    if (value != arg)
        printf("Thread %d failed.\n", threadIdx.x);
    else
        printf("succeed\n");
}
int main() {
    bcast<<<1, 32>>>(1234);
    cudaDeviceSynchronize();
    getchar(); // keep the console window open
    return 0;
}

Inclusive scan across sub-partitions of 8 threads:

#include <stdio.h>
__global__ void scan4() {
    int laneId = threadIdx.x & 0x1f;
    // Seed sample starting value (inverse of lane ID)
    int value = 31 - laneId;
    // Loop to accumulate scan within my partition.
    // Scan requires log2(n) == 3 steps for 8 threads
    // It works by an accumulated sum up the warp
    // by 1, 2, 4, 8 etc. steps.
    for (int i = 1; i <= 4; i *= 2) {
        // We do the __shfl_sync unconditionally so that we
        // can read even from threads which won't do a
        // sum, and then conditionally assign the result.
        int n = __shfl_up_sync(0xffffffff, value, i, 8);
        if ((laneId & 7) >= i)
            value += n;
    }
    printf("Thread %d final value = %d\n", threadIdx.x, value);
}
int main() {
    scan4<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}

Reduction across a warp:

#include <stdio.h>
__global__ void warpReduce() {
    int laneId = threadIdx.x & 0x1f;
    // Seed starting value as inverse lane ID
    int value = 31 - laneId;
    // Use XOR mode to perform butterfly reduction
    for (int i = 16; i >= 1; i /= 2)
        value += __shfl_xor_sync(0xffffffff, value, i, 32);
    // "value" now contains the sum across all threads
    printf("Thread %d final value = %d\n", threadIdx.x, value);
}
int main() {
    warpReduce<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}

Warp Matrix Functions:
C++ warp matrix operations leverage Tensor Cores to accelerate matrix problems of the form D = A * B + C. They require the cooperation of all threads in a warp.
These warp matrix functions are a preview feature supported on devices of compute capability 7.0 or higher. The data structures and APIs described here are subject to change in future releases and may be incompatible with those future releases.
Description:
All of the following functions and types are defined in the namespace nvcuda::wmma.

template<typename Use, int m, int n, int k, typename T, typename Layout = void>
class fragment;
template<> class fragment<matrix_a, 16, 16, 16, __half, row_major>
template<> class fragment<matrix_a, 16, 16, 16, __half, col_major>
template<> class fragment<matrix_b, 16, 16, 16, __half, row_major>
template<> class fragment<matrix_b, 16, 16, 16, __half, col_major>
template<> class fragment<accumulator, 16, 16, 16, __half>
template<> class fragment<accumulator, 16, 16, 16, float>
void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm);
void load_matrix_sync(fragment<...> &a, const T* mptr, unsigned ldm, layout_t layout);
void store_matrix_sync(T* mptr, const fragment<...> &a, unsigned ldm, layout_t layout);
void fill_fragment(fragment<...> &a, const T& v);
void mma_sync(fragment<...> &d, const fragment<...> &a, const fragment<...> &b, const fragment<...> &c, bool satf = false);
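
As a minimal sketch of how these pieces fit together, the kernel below multiplies a single 16x16 tile with one warp: it loads matrix_a and matrix_b fragments, zero-fills the accumulator, performs one mma_sync, and stores the result. The pointers a, b, c and the leading dimension of 16 are assumptions for this illustration (contiguous 16x16 tiles already resident in device memory).

#include <mma.h>
using namespace nvcuda;

// One warp computes c = a * b for a single 16x16x16 tile.
// a, b: 16x16 half-precision inputs; c: 16x16 float output
// (assumed device pointers with leading dimension 16).
__global__ void wmma_ker(half *a, half *b, float *c) {
    // Declare the fragments
    wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
    wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
    wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;

    // Initialize the accumulator to zero
    wmma::fill_fragment(c_frag, 0.0f);

    // Load the inputs from device memory
    wmma::load_matrix_sync(a_frag, a, 16);
    wmma::load_matrix_sync(b_frag, b, 16);

    // Perform the matrix multiply-accumulate
    wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);

    // Store the result
    wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
}

The kernel must be launched with at least a full warp of 32 threads, e.g. wmma_ker<<<1, 32>>>(d_a, d_b, d_c), and compiled for compute capability 7.0 or higher (e.g. nvcc -arch=sm_70).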

