reference1 : https://gist.github.com/donny-dont/1471329
reference2 : http://blogs.msdn.com/b/vcblog/archive/2008/08/28/the-mallocator.aspx
#ifdef _WIN32
#include <malloc.h>
#endif
#include <cstdint>
#include <vector>
#include <iostream>
/**
* Allocator for aligned data.
*
* Modified from the Mallocator from Stephan T. Lavavej.
* <http://blogs.msdn.com/b/vcblog/archive/2008/08/28/the-mallocator.aspx>
*/
template <typename T, std::size_t Alignment>
class aligned_allocator
{
public:
// The following will be the same for virtually all allocators.
typedef T * pointer;
typedef const T * const_pointer;
typedef T& reference;
typedef const T& const_reference;
typedef T value_type;
typedef std::size_t size_type;
typedef ptrdiff_t difference_type;
T * address(T& r) const
{
return &r;
}
const T * address(const T& s) const
{
return &s;
}
std::size_t max_size() const
{
// The following has been carefully written to be independent of
// the definition of size_t and to avoid signed/unsigned warnings.
return (static_cast<std::size_t>(0) - static_cast<std::size_t>(1)) / sizeof(T);
}
// The following must be the same for all allocators.
template <typename U>
struct rebind
{
typedef aligned_allocator<U, Alignment> other;
} ;
bool operator!=(const aligned_allocator& other) const
{
return !(*this == other);
}
void construct(T * const p, const T& t) const
{
void * const pv = static_cast<void *>(p);
new (pv) T(t);
}
void destroy(T * const p) const
{
p->~T();
}
// Returns true if and only if storage allocated from *this
// can be deallocated from other, and vice versa.
// Always returns true for stateless allocators.
bool operator==(const aligned_allocator& other) const
{
return true;
}
// Default constructor, copy constructor, rebinding constructor, and destructor.
// Empty for stateless allocators.
aligned_allocator() { }
aligned_allocator(const aligned_allocator&) { }
template <typename U> aligned_allocator(const aligned_allocator<U, Alignment>&) { }
~aligned_allocator() { }
// The following will be different for each allocator.
T * allocate(const std::size_t n) const
{
// The return value of allocate(0) is unspecified.
// Mallocator returns NULL in order to avoid depending
// on malloc(0)'s implementation-defined behavior
// (the implementation can define malloc(0) to return NULL,
// in which case the bad_alloc check below would fire).
// All allocators can return NULL in this case.
if (n == 0) {
return NULL;
}
// All allocators should contain an integer overflow check.
// The Standardization Committee recommends that std::length_error
// be thrown in the case of integer overflow.
if (n > max_size())
{
throw std::length_error("aligned_allocator<T>::allocate() - Integer overflow.");
}
// Mallocator wraps malloc().
void * const pv = _mm_malloc(n * sizeof(T), Alignment);
// Allocators should throw std::bad_alloc in the case of memory allocation failure.
if (pv == NULL)
{
throw std::bad_alloc();
}
return static_cast<T *>(pv);
}
void deallocate(T * const p, const std::size_t n) const
{
_mm_free(p);
}
// The following will be the same for all allocators that ignore hints.
template <typename U>
T * allocate(const std::size_t n, const U * /* const hint */) const
{
return allocate(n);
}
// Allocators are not required to be assignable, so
// all allocators should have a private unimplemented
// assignment operator. Note that this will trigger the
// off-by-default (enabled under /Wall) warning C4626
// "assignment operator could not be generated because a
// base class assignment operator is inaccessible" within
// the STL headers, but that warning is useless.
private:
aligned_allocator& operator=(const aligned_allocator&);
};
int main()
{
typedef std::vector<__m128, aligned_allocator<__m128, sizeof(__m128)> > aligned_vector;
aligned_vector lhs;
aligned_vector rhs;
float a = 1.0f;
float b = 2.0f;
float c = 3.0f;
float d = 4.0f;
float e = 5.0f;
float f = 6.0f;
float g = 7.0f;
float h = 8.0f;
for (std::size_t i = 0; i < 1000; ++i)
{
lhs.push_back(_mm_set_ps(a, b, c, d));
rhs.push_back(_mm_set_ps(e, f, g, h));
a += 1.0f; b += 1.0f; c += 1.0f; d += 1.0f;
e += 1.0f; f += 1.0f; g += 1.0f; h += 1.0f;
}
__m128 mul = _mm_mul_ps(lhs[10], rhs[10]);
}