diff --git a/HC.make b/HC.make
new file mode 100644
index 0000000..a79acf4
--- /dev/null
+++ b/HC.make
@@ -0,0 +1,21 @@
+# Builds hc-stream with hcc. Optional overrides: TBSIZE (tile size), NTILES (tile count).
+HCC = hcc
+
+CXXFLAGS+=-O3 $(shell hcc-config --cxxflags)
+LDFLAGS+=$(shell hcc-config --ldflags)
+
+ifdef TBSIZE
+CXXFLAGS+=-DVIRTUALTILESIZE=$(TBSIZE)
+endif
+
+ifdef NTILES
+CXXFLAGS+=-DNTILES=$(NTILES)
+endif
+
+# Both translation units are compiled and linked in a single hcc invocation.
+hc-stream: main.cpp HCStream.cpp
+	$(HCC) $(CXXFLAGS) -DHC  $^  $(LDFLAGS) $(EXTRA_FLAGS) -o $@
+
+.PHONY: clean
+clean:
+	rm -f hc-stream
diff --git a/HCStream.cpp b/HCStream.cpp
new file mode 100644
index 0000000..b1b4a9b
--- /dev/null
+++ b/HCStream.cpp
@@ -0,0 +1,276 @@
+// Copyright (c) 2017 Peter Steinbach, MPI CBG Scientific Computing Facility
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#include "HCStream.h"
+
+#include <codecvt>
+#include <vector>
+#include <locale>
+#include <numeric>
+
+// Default tile size (override with -DVIRTUALTILESIZE); the value was obtained through experimentation using a Fiji R9 Nano with rocm 1.6-115
+#ifndef VIRTUALTILESIZE
+#define VIRTUALTILESIZE 256
+#endif
+
+// Default number of tiles (override with -DNTILES); the value was obtained through experimentation using a Fiji R9 Nano with rocm 1.6-115
+#ifndef NTILES
+#define NTILES 2048
+#endif
+
+
+// Returns the accelerator's description, converted from wide characters to a UTF-8 std::string.
+std::string getDeviceName(const hc::accelerator& _acc)
+{
+  std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> utf8_converter;
+  return utf8_converter.to_bytes(_acc.get_description());
+}
+
+// Prints an indexed list of the accelerators returned by hc::accelerator::get_all()
+// to stdout, or "No devices found." to stderr when that list is empty.
+void listDevices(void)
+{
+  const std::vector<hc::accelerator> accs = hc::accelerator::get_all();
+
+  if (accs.empty())
+  {
+    std::cerr << "No devices found." << std::endl;
+    return;
+  }
+
+  std::cout << std::endl;
+  std::cout << "Devices:" << std::endl;
+  // std::size_t index: avoids comparing a signed int against the unsigned size().
+  for (std::size_t i = 0; i < accs.size(); i++)
+  {
+    std::cout << i << ": " << getDeviceName(accs[i]) << std::endl;
+  }
+  std::cout << std::endl;
+}
+
+
+// Allocates three ARRAY_SIZE-element arrays, then makes entry device_index of
+// hc::accelerator::get_all() the default accelerator. Throws std::runtime_error
+// when ARRAY_SIZE is not a multiple of VIRTUALTILESIZE.
+template <class T>
+HCStream<T>::HCStream(const unsigned int ARRAY_SIZE, const int device_index):
+  array_size(ARRAY_SIZE),
+  d_a(ARRAY_SIZE),
+  d_b(ARRAY_SIZE),
+  d_c(ARRAY_SIZE)
+{
+  // The array size must be divisible by VIRTUALTILESIZE for kernel launches
+  const bool tile_aligned = (ARRAY_SIZE % VIRTUALTILESIZE) == 0;
+  if (!tile_aligned)
+  {
+    std::stringstream message;
+    message << "Array size must be a multiple of " << VIRTUALTILESIZE;
+    throw std::runtime_error(message.str());
+  }
+
+  // NOTE(review): d_a/d_b/d_c are constructed before set_default runs - confirm they end up on the selected device.
+  std::vector<hc::accelerator> devices = hc::accelerator::get_all();
+  auto selected = devices.at(device_index);
+  hc::accelerator::set_default(selected.get_device_path());
+  std::cout << "Using HC device " << getDeviceName(selected) << std::endl;
+}
+
+
+// No explicit cleanup is written here; the members are destroyed
+// automatically by their own destructors.
+template <class T>
+HCStream<T>::~HCStream() = default;
+
+// Fills d_a, d_b and d_c with the constants _a, _b and _c. All three fill
+// kernels are launched before any of them is waited on.
+template <class T>
+void HCStream<T>::init_arrays(T _a, T _b, T _c)
+{
+  hc::array_view<T,1> view_a(this->d_a);
+  hc::array_view<T,1> view_b(this->d_b);
+  hc::array_view<T,1> view_c(this->d_c);
+  const hc::extent<1> all_elements(array_size);
+
+  hc::completion_future fill_a = hc::parallel_for_each(
+      all_elements,
+      [=](hc::index<1> idx) [[hc]] { view_a[idx] = _a; });
+  hc::completion_future fill_b = hc::parallel_for_each(
+      all_elements,
+      [=](hc::index<1> idx) [[hc]] { view_b[idx] = _b; });
+  hc::completion_future fill_c = hc::parallel_for_each(
+      all_elements,
+      [=](hc::index<1> idx) [[hc]] { view_c[idx] = _c; });
+  // An exception surfacing from any of the waits is logged and re-thrown.
+  try {
+    fill_a.wait();
+    fill_b.wait();
+    fill_c.wait();
+  }
+  catch (std::exception& e) {
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::init_arrays " << e.what() << std::endl;
+    throw;
+  }
+
+}
+// Copies d_a, d_b and d_c into a, b and c through hc::copy, writing from each vector's begin().
+template <class T>
+void HCStream<T>::read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c)
+{
+  hc::copy(this->d_a, std::begin(a));
+  hc::copy(this->d_b, std::begin(b));
+  hc::copy(this->d_c, std::begin(c));
+}
+
+
+// Copy kernel: c[i] = a[i] for every element; returns once the kernel has completed.
+template <class T>
+void HCStream<T>::copy()
+{
+  hc::array_view<T,1> source(this->d_a);
+  hc::array_view<T,1> target(this->d_c);
+
+  try {
+    hc::completion_future done = hc::parallel_for_each(
+        hc::extent<1>(array_size),
+        [=](hc::index<1> idx) [[hc]] { target[idx] = source[idx]; });
+    // Block until the device has finished the kernel.
+    done.wait();
+  }
+  catch (std::exception& e) {
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::copy " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Scale kernel: b[i] = startScalar * c[i] for every element; returns once the kernel has completed.
+template <class T>
+void HCStream<T>::mul()
+{
+  const T factor = startScalar;
+  hc::array_view<T,1> scaled(this->d_b);
+  hc::array_view<T,1> input(this->d_c);
+
+  try {
+    hc::completion_future done = hc::parallel_for_each(
+        hc::extent<1>(array_size),
+        [=](hc::index<1> idx) [[hc]] { scaled[idx] = factor*input[idx]; });
+    // Block until the device has finished the kernel.
+    done.wait();
+  }
+  catch (std::exception& e) {
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::mul " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Add kernel: c[i] = a[i] + b[i] for every element.
+// Returns once the kernel has completed.
+template <class T>
+void HCStream<T>::add()
+{
+  hc::array_view<T,1> lhs(this->d_a);
+  hc::array_view<T,1> rhs(this->d_b);
+  hc::array_view<T,1> sum(this->d_c);
+
+  try {
+    hc::completion_future done = hc::parallel_for_each(
+        hc::extent<1>(array_size),
+        [=](hc::index<1> idx) [[hc]] { sum[idx] = lhs[idx]+rhs[idx]; });
+    // Block until the device has finished the kernel.
+    done.wait();
+  }
+  catch (std::exception& e) {
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::add " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Triad kernel: a[i] = b[i] + startScalar * c[i]; returns once the kernel has completed.
+template <class T>
+void HCStream<T>::triad()
+{
+  const T factor = startScalar;
+  hc::array_view<T,1> out(this->d_a);
+  hc::array_view<T,1> addend(this->d_b);
+  hc::array_view<T,1> scaled(this->d_c);
+
+  try {
+    hc::completion_future done = hc::parallel_for_each(
+        hc::extent<1>(array_size),
+        [=](hc::index<1> idx) [[hc]] { out[idx] = addend[idx] + factor*scaled[idx]; });
+    // Block until the device has finished the kernel.
+    done.wait();
+  }
+  catch (std::exception& e) {
+    std::cerr << __FILE__ << ":" << __LINE__ << "\t HCStream<T>::triad " << e.what() << std::endl;
+    throw;
+  }
+}
+// Returns the dot product of d_a and d_b: one partial sum per tile is computed on the device, then the partial sums are added on the host.
+template <class T>
+T HCStream<T>::dot()
+{
+    // Implementation adapted from
+    // https://ampbook.codeplex.com/SourceControl/latest
+    // ->Samples/CaseStudies/Reduction
+    // ->CascadingReduction.h
+    // The arrays are bound by reference here and captured by reference in the kernel below.
+    const auto& view_a = this->d_a;
+    const auto& view_b = this->d_b;
+    // NOTE(review): ex is not used anywhere below.
+    auto ex = view_a.get_extent();
+    const auto tiled_ex = hc::extent<1>(NTILES * VIRTUALTILESIZE).tile(VIRTUALTILESIZE);
+    const auto domain_sz = tiled_ex.size();
+    // One partial sum per tile.
+    hc::array<T, 1> partial(NTILES);
+
+    hc::parallel_for_each(tiled_ex,
+                          [=,
+                           &view_a,
+                           &view_b,
+                           &partial](const hc::tiled_index<1>& tidx) [[hc]] {
+        // Each index accumulates the products at gidx, gidx + domain_sz, ... while gidx is inside the array.
+                            auto gidx = tidx.global[0];
+        T r = T{0}; // Assumes reduction op is addition.
+        while (gidx < view_a.get_extent().size()) {
+            r += view_a[gidx] * view_b[gidx];
+            gidx += domain_sz;
+        }
+        // Stage the per-index sums in tile_static storage for the reduction within the tile.
+        tile_static T tileData[VIRTUALTILESIZE];
+        tileData[tidx.local[0]] = r;
+
+        tidx.barrier.wait_with_tile_static_memory_fence();
+        // Halving reduction into tileData[0]. NOTE(review): presumably requires VIRTUALTILESIZE to be a power of two - confirm.
+        for (auto h = VIRTUALTILESIZE / 2; h; h /= 2) {
+            if (tidx.local[0] < h) {
+                tileData[tidx.local[0]] += tileData[tidx.local[0] + h];
+            }
+            tidx.barrier.wait_with_tile_static_memory_fence();
+        }
+        // The index at the tile origin stores the tile's result.
+        if (tidx.global == tidx.tile_origin) partial[tidx.tile] = tileData[0];
+    });
+    // Wait on the accelerator view of partial before reading it back; exceptions are logged and re-thrown.
+    try {
+        partial.get_accelerator_view().wait();
+    }
+    catch (std::exception& e) {
+        std::cerr << __FILE__ << ":" << __LINE__ << "\t  HCStream<T>::dot " << e.what() << std::endl;
+        throw;
+    }
+    // Copy the NTILES partial sums to the host and add them up there.
+    std::vector<T> h_partial(NTILES,0);
+    hc::copy(partial,h_partial.begin());
+    // The initial value 0. is a double, so std::accumulate sums in double before the conversion to T.
+    T result = std::accumulate(h_partial.begin(), h_partial.end(), 0.);
+
+    return result;
+
+
+}
+// Explicit instantiations for float and double; the member functions above are defined in this .cpp file.
+template class HCStream<float>;
+template class HCStream<double>;
diff --git a/HCStream.h b/HCStream.h
new file mode 100644
index 0000000..4bc2b18
--- /dev/null
+++ b/HCStream.h
@@ -0,0 +1,46 @@
+
+// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith,
+// University of Bristol HPC
+//
+// For full license terms please see the LICENSE file distributed with this
+// source code
+
+#pragma once
+
+#include <iostream>
+#include <stdexcept>
+#include <sstream>
+
+#include "Stream.h"
+#include "hc.hpp"
+
+#define IMPLEMENTATION_STRING "HC"
+// HC implementation of the Stream<T> interface; T is the element type of the arrays.
+template <class T>
+class HCStream : public Stream<T>
+{
+protected:
+  // Number of elements in each array
+  unsigned int array_size;
+  // hc::array storage for the three benchmark arrays
+  hc::array<T,1> d_a;
+  hc::array<T,1> d_b;
+  hc::array<T,1> d_c;
+
+public:
+  // Arguments: number of elements per array, index of the device to use
+  HCStream(const unsigned int, const int);
+  ~HCStream();
+
+  // Benchmark kernels
+  virtual void copy() override;
+  virtual void add() override;
+  virtual void mul() override;
+  virtual void triad() override;
+  virtual T dot() override;
+
+  // Fill the arrays with constants / copy their contents into host vectors
+  virtual void init_arrays(T initA, T initB, T initC) override;
+  virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
+
+};
diff --git a/main.cpp b/main.cpp
index 33cef1e..8a2108c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -23,6 +23,8 @@
 #include "CUDAStream.h"
 #elif defined(HIP)
 #include "HIPStream.h"
+#elif defined(HC)
+#include "HCStream.h"
 #elif defined(OCL)
 #include "OCLStream.h"
 #elif defined(USE_RAJA)
@@ -105,6 +107,10 @@ void run()
   // Use the HIP implementation
   stream = new HIPStream<T>(ARRAY_SIZE, deviceIndex);
 
+#elif defined(HC)
+  // Use the HC implementation
+  stream = new HCStream<T>(ARRAY_SIZE, deviceIndex);
+
 #elif defined(OCL)
   // Use the OpenCL implementation
   stream = new OCLStream<T>(ARRAY_SIZE, deviceIndex);