Sorting with CUDA libraries

In this post, we will see two possibilities to perform sorting in CUDA using libraries. The first is using Thrust, while the second is CUB, which has several “device-wide” primitives.
Thrust uses radix-sorting for fundamental data types when using the already defined comparison operations, see  Thrust: A Productivity-Oriented Library for CUDA.
Opposite to that, when one needs to sort objects by defining customized struct comparison operators, Thrust will switch to merge-sort. Radix-sort is remarkably faster than merge-sort, so the recommendation is using built-in comparison operators when possible.

Besides Thrust, it is necessary to also mention CUB which has DeviceRadixSort which provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory.

Below an example showing the use of both libraries to perform sorting is reported.
It should be noticed that cub::DeviceRadixSort performs a “sort-by-key”, while Thrust has both thrust::sort and thrust::sort_by_key available.
Of course, CUB cub::DeviceRadixSort can be used for simple sorting by making the keys array coincide with the values array.
The commented lines of the code below provide simple sorting for CUB or sorting by key for Thrust.

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>

#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// for printing
#include <thrust/copy.h>
#include <ostream>

#define STRIDE 2
#define N 100

#define pi_f  3.14159265358979f                    // Greek pi in single precision

struct sin_functor
        __host__ __device__
        float operator()(float x) const
            return sin(2.f*pi_f*x);

template <typename Iterator>
class strided_range

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
            return stride * i;

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
        : first(first), last(last), stride(stride) {}

    iterator begin(void) const
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));

    iterator end(void) const
        return begin() + ((last - first) + (stride - 1)) / stride;

    Iterator first;
    Iterator last;
    difference_type stride;

int main(void)
    // --- Generate the integration coefficients
    thrust::host_vector<float> h_coefficients(STRIDE);
    h_coefficients[0] = 4.f;
    h_coefficients[1] = 2.f;

    thrust::device_vector<float> d_coefficients(N);

    typedef thrust::device_vector<float>::iterator Iterator;
    strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
    strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);

    thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
    thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);

    d_coefficients[0]        = 1.f;
    d_coefficients[N-1]        = 1.f;

    // print the generated d_coefficients
    std::cout << "d_coefficients: ";
    thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " "));  std::cout << std::endl;

    // --- Generate sampling points
    float a     = 0.f;
    float b     = .5f;

    float Dx    = (b-a)/(float)(N-1);

    thrust::device_vector<float> d_x(N);


    for(int i=0; i<N; i++) {
        float val = d_x[i];
        printf("%d %fn", i, val);

    // --- Calculate function values
    thrust::device_vector<float> d_y(N);

    thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());

    for(int i=0; i<N; i++) {
        float val = d_y[i];
        printf("%d %fn", i, val);

    // --- Calculate integral
    float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);

    printf("The integral is = %fn", integral);


    return 0;

Leave a Reply

Your email address will not be published. Required fields are marked *