//===-- Test for the parallel scan and reduction operations on the GPU ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/CPP/bit.h"
#include "src/__support/GPU/utils.h"
#include "test/IntegrationTest/test.h"

using namespace LIBC_NAMESPACE;

static uint32_t sum(uint32_t n) { return n * (n + 1) / 2; }

// Tests a reduction within a convergent warp or wavefront using some known
// values. For example, if every element in the lane is one, then the sum should
// be the size of the warp or wavefront, i.e. 1 + 1 + 1 ... + 1.
static void test_reduce() {
  uint64_t mask = gpu::get_lane_mask();
  uint32_t x = gpu::reduce(mask, 1);
  EXPECT_EQ(x, gpu::get_lane_size());

  uint32_t y = gpu::reduce(mask, gpu::get_lane_id());
  EXPECT_EQ(y, sum(gpu::get_lane_size() - 1));

  uint32_t z = 0;
  if (gpu::get_lane_id() % 2)
    z = gpu::reduce(gpu::get_lane_mask(), 1);
  gpu::sync_lane(mask);

  EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_size() / 2 : 0);
}

// Tests an accumulation scan within a convergent warp or wavefront using some
// known values. For example, if every element in the lane is one, then the scan
// should have each element equal to its lane ID plus one, i.e. 1, 1 + 1, ...
static void test_scan() {
  uint64_t mask = gpu::get_lane_mask();

  uint32_t x = gpu::scan(mask, 1);
  EXPECT_EQ(x, gpu::get_lane_id() + 1);

  uint32_t y = gpu::scan(mask, gpu::get_lane_id());
  EXPECT_EQ(y, sum(gpu::get_lane_id()));

  uint32_t z = 0;
  if (gpu::get_lane_id() % 2)
    z = gpu::scan(gpu::get_lane_mask(), 1);
  gpu::sync_lane(mask);

  EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
}

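// Simple xorshift*-style pseudo-random number generator used to create the
// divergence pattern and input values below.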
static uint32_t random(uint64_t *rand_next) {
  uint64_t x = *rand_next;
  x ^= x >> 12;
  x ^= x << 25;
  x ^= x >> 27;
  *rand_next = x;
  return static_cast<uint32_t>((x * 0x2545F4914F6CDD1Dul) >> 32);
}

// Scan operations can break down under thread divergence, so make sure that
// the function works under some random divergence. We do this by trivially
// implementing a scan with shared scratch memory and then comparing the
// results.
static void test_scan_divergent() {
  static uint32_t input[64] = {0};
  static uint32_t result[64] = {0};
  uint64_t state = gpu::processor_clock() + __gpu_lane_id();

  for (int i = 0; i < 64; ++i) {
    uint64_t lanemask = gpu::get_lane_mask();
    // Randomly choose which lanes participate to create divergence.
    if (random(&state) & (1ull << gpu::get_lane_id())) {
      uint64_t divergent = gpu::get_lane_mask();
      uint32_t value = random(&state) % 256;
      input[gpu::get_lane_id()] = value;

      // The first active lane serially computes the reference inclusive scan.
      if (gpu::is_first_lane(divergent)) {
        uint32_t accumulator = 0;
        for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) {
          uint32_t tmp = input[lane];
          result[lane] = tmp + accumulator;
          accumulator += tmp;
        }
      }
      gpu::sync_lane(divergent);

      uint32_t scan = gpu::scan(divergent, value);
      EXPECT_EQ(scan, result[gpu::get_lane_id()]);
    }
    // Reset the scratch input before the next iteration.
    if (gpu::is_first_lane(lanemask))
      __builtin_memset(input, 0, sizeof(input));
    gpu::sync_lane(lanemask);
  }
}

TEST_MAIN(int argc, char **argv, char **envp) {
  // Only run the tests on the first warp or wavefront.
  if (gpu::get_thread_id() >= gpu::get_lane_size())
    return 0;

  test_reduce();

  test_scan();

  test_scan_divergent();

  return 0;
}