Skip to content

Commit

Permalink
resolve issue #171 regarding SIMD
Browse files Browse the repository at this point in the history
  • Loading branch information
masajiro committed Oct 31, 2024
1 parent fae388a commit f07d824
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 210 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.3.2
2.3.3
209 changes: 0 additions & 209 deletions lib/NGT/NGTQ/Quantizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -1287,73 +1287,6 @@ class ScalarQuantizedInt8ObjectProcessingStream {
const std::type_info* dataTypeInfo;
};

// Processing stream that stores scalar-quantized uint8 objects in a
// block-transposed layout.
//
// Layout written by arrangeObject():
//   * Elements below the last block boundary are grouped in rows of
//     `blockSize` (64) elements. One row holds that 64-element slice of every
//     object back to back, so a row is `lineSize` = blockSize * numOfObjects
//     bytes long.
//   * The remaining elements are grouped in sub-blocks of `smallBlockSize`
//     (16) elements; each sub-block row holds that 16-element slice of every
//     object back to back.
class ScalarQuantizedUint8TransposedObjectProcessingStream : public ScalarQuantizedInt8ObjectProcessingStream {
 public:
  // Creates a stream for `nOfObjects` objects of `dimension` elements and
  // allocates a zero-filled buffer of `streamSize` bytes.
  ScalarQuantizedUint8TransposedObjectProcessingStream(size_t dimension, size_t nOfObjects, NGTQ::Quantizer &q):
    ScalarQuantizedInt8ObjectProcessingStream(q) {
    initialize(dimension, nOfObjects);
    stream = new uint8_t[streamSize]();
  }

  // Creates an instance without a buffer (stream is null, zero objects).
  // NOTE(review): the base class receives a reference formed from a null
  // pointer. This is only safe as long as the quantizer is never touched on
  // such an instance; presumably it is used for size calculation only - confirm.
  ScalarQuantizedUint8TransposedObjectProcessingStream(size_t dim):
    ScalarQuantizedInt8ObjectProcessingStream(*reinterpret_cast<NGTQ::Quantizer*>(0)) {
    initialize(dim, 0);
    stream = 0;
  }

  // Sets the geometry (dimension, object count, block sizes, row length) and
  // recomputes `streamSize`. Does not allocate.
  void initialize(size_t dim, size_t nOfObjects) {
    dimension = dim;
    numOfObjects = nOfObjects;
    smallBlockSize = 16;
    blockSize = 64;
    lineSize = blockSize * nOfObjects;
    setStreamSize();
  }

  // Quantizes `object` in place and writes it into slot `dataNo` of the stream.
  //
  // dataNo  index of the object inside the stream; must be < numOfObjects.
  // object  values to store; modified by the quantization step.
  // scale, offset, shift  forwarded to NGT::ObjectSpace::quantizeToQint8.
  //
  // Raises (via NGTThrowException) when `dataNo` is out of range or when a
  // computed position falls outside the stream.
  void arrangeObject(size_t dataNo, std::vector<float> &object, float scale, float offset, bool shift) {
    if (dataNo >= numOfObjects) {
      std::stringstream msg;
      msg << "The data index is out of the range. " << dataNo << ":" << numOfObjects;
      NGTThrowException(msg);
    }
    NGT::ObjectSpace::quantizeToQint8(object, *dataTypeInfo, dimension, offset, scale, shift);
    if (object.empty()) {
      // Nothing to store. Without this guard `object.size() - 1` below wraps
      // around and the copy loop would read past the end of the empty vector.
      return;
    }
    // Start of the block that contains the last element. Everything before it
    // uses the 64-wide layout, everything from it onwards the 16-wide layout.
    // NOTE(review): when object.size() is an exact multiple of blockSize the
    // final full block is therefore stored in the 16-wide layout. Confirm that
    // the code consuming this stream expects that.
    const size_t endOfBlock = ((object.size() - 1) / blockSize) * blockSize;
    for (size_t i = 0; i < endOfBlock; i++) {
      const size_t idx = lineSize * (i / blockSize) + blockSize * dataNo + i % blockSize;
      storeElement(idx, dataNo, i, object, 1);
    }
    const size_t smallLineSize = smallBlockSize * numOfObjects;
    for (size_t i = endOfBlock; i < object.size(); i++) {
      const size_t tail = i - endOfBlock;
      const size_t idx = lineSize * (i / blockSize)
                       + tail / smallBlockSize * smallLineSize
                       + smallBlockSize * dataNo
                       + tail % smallBlockSize;
      storeElement(idx, dataNo, i, object, 2);
    }
  }

  // Recomputes `streamSize`: the dimension rounded up to a multiple of
  // `smallBlockSize`, times the number of objects.
  void setStreamSize() {
    streamSize = ((dimension - 1) / smallBlockSize + 1) * smallBlockSize * numOfObjects;
    return;
  }

  // Returns the stream size in bytes for `nOfObjects` objects.
  // Side effect: overwrites `numOfObjects` and `streamSize`; `lineSize` is
  // left unchanged.
  size_t getStreamSize(size_t nOfObjects) {
    numOfObjects = nOfObjects;
    setStreamSize();
    return streamSize;
  }

  size_t smallBlockSize;  // width of a tail sub-block (set to 16)
  size_t blockSize;       // width of a full block (set to 64)
  size_t lineSize;        // bytes per full-block row: blockSize * object count

 private:
  // Writes element `i` of `object` to position `idx` of the stream after a
  // bounds check. `phase` (1 = full-block part, 2 = tail part) only selects
  // the number printed in the error message.
  void storeElement(size_t idx, size_t dataNo, size_t i, const std::vector<float> &object, int phase) {
    if (idx >= streamSize) {
      std::stringstream msg;
      msg << "Fatal inner error! (" << phase << ") " << idx << ":" << streamSize << " "
          << dataNo << ":" << i << "/" << object.size();
      NGTThrowException(msg);
    }
    stream[idx] = static_cast<uint8_t>(object[i]);
  }
};
/////

class QuantizedObjectDistance {
Expand Down Expand Up @@ -3048,148 +2981,6 @@ class ScalarQuantizedInt8ObjectDistance : public NonLocalQuantizedObjectDistance

};

// Distance functor for uint8 scalar-quantized objects stored in a
// block-transposed stream. It computes the sum of squared differences between
// one query and `noOfObjects` objects with AVX-512 / AVX2 intrinsics.
class ScalarQuantizedUint8TransposedObjectDistance : public ScalarQuantizedInt8ObjectDistance<NGT::quint8, NGT::quint8> {
 public:
  ScalarQuantizedUint8TransposedObjectDistance(Quantizer &q, DistanceType dtype):ScalarQuantizedInt8ObjectDistance<NGT::quint8, NGT::quint8>(q, dtype) {}
  ~ScalarQuantizedUint8TransposedObjectDistance() {}

  // Computes the squared distance from `query` to each object in `inv`.
  //
  // inv          object bytes: for every 64-byte group of the query, 64 bytes
  //              per object back to back; then for every remaining 16-byte
  //              group, 16 bytes per object back to back.
  // distances    output array with room for `noOfObjects` values.
  // noOfObjects  number of objects in `inv`.
  // distanceLUT  not used by this implementation.
  // query        `dimension` query bytes; must not be null.
  //
  // Query bytes left over after the last complete 16-byte group are ignored.
  // With NGTQBG_MIN defined the smallest computed distance is returned
  // (std::numeric_limits<float>::max() when there are no objects).
  // Raises (via NGTThrowException) when `query` is null.
#ifdef NGTQBG_MIN
  inline float operator()(void *inv, float *distances, size_t noOfObjects, DistanceLookupTableUint8 &distanceLUT, void *query) {
#else
  inline void operator()(void *inv, float *distances, size_t noOfObjects, DistanceLookupTableUint8 &distanceLUT, void *query) {
#endif
    if (query == 0) {
      NGTThrowException("Fatal inner error! The specified query is invalid..");
    }
    if (noOfObjects == 0) {
      // Avoid zero-length arrays below; nothing to compute.
#ifdef NGTQBG_MIN
      return std::numeric_limits<float>::max();
#else
      return;
#endif
    }
    const uint8_t *object = static_cast<uint8_t*>(inv);
    const uint8_t *qobject = static_cast<uint8_t*>(query);
    const unsigned char *last = qobject + dimension;
    // One 8 x int32 accumulator per object (variable-length array, a compiler
    // extension in C++).
    __m256i sum256[noOfObjects];
#if defined(NGT_AVX512)
    {
      __m512i sum512[noOfObjects];
      for (size_t oi = 0; oi < noOfObjects; oi++) {
        sum512[oi] = _mm512_setzero_si512();
      }
      // Compare remaining lengths instead of pointers so that no pointer
      // before the start of the query is formed for short queries.
      while (last - qobject >= 64) {
        __m512i q = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(qobject));
        for (size_t oi = 0; oi < noOfObjects; oi++) {
          __m512i o = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(object));
          // |q - o| per byte: pick o - q where q < o and q - o elsewhere.
          __mmask64 m = _mm512_cmplt_epu8_mask(q, o);
          __m512i x = _mm512_add_epi8(_mm512_maskz_subs_epu8(m, o, q),
                                      _mm512_maskz_subs_epu8(~m, q, o));
          // Widen each 32-byte half to 16 bit, then square and pair-add.
          __m512i xi16 = _mm512_cvtepu8_epi16(_mm512_extracti32x8_epi32(x,0));
          sum512[oi] = _mm512_add_epi32(sum512[oi], _mm512_madd_epi16(xi16, xi16));
          xi16 = _mm512_cvtepu8_epi16(_mm512_extracti32x8_epi32(x,1));
          sum512[oi] = _mm512_add_epi32(sum512[oi], _mm512_madd_epi16(xi16, xi16));
          object += 64;
        }
        qobject += 64;
      }
      // Fold the 512-bit accumulators into the 256-bit ones used below.
      for (size_t oi = 0; oi < noOfObjects; oi++) {
        sum256[oi] = _mm256_add_epi32(_mm512_extracti32x8_epi32(sum512[oi], 0),
                                      _mm512_extracti32x8_epi32(sum512[oi], 1));
      }
    }
#else
    // Always start from zero: the 16-byte tail loop below accumulates into
    // sum256 even when the 64-byte AVX2 loop is compiled out.
    for (size_t oi = 0; oi < noOfObjects; oi++) {
      sum256[oi] = _mm256_setzero_si256();
    }
#if defined(NGT_AVX2)
    {
      while (last - qobject >= 64) {
        // Widen one 64-byte query group into four 16 x int16 vectors.
        __m256i q[4];
        for (size_t k = 0; k < 4; k++) {
          q[k] = _mm256_cvtepu8_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(qobject + 16 * k)));
        }
        qobject += 64;
        for (size_t oi = 0; oi < noOfObjects; oi++) {
          for (size_t k = 0; k < 4; k++) {
            __m256i o = _mm256_cvtepu8_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(object)));
            __m256i sub16 = _mm256_subs_epi16(q[k], o);
            sum256[oi] = _mm256_add_epi32(sum256[oi], _mm256_madd_epi16(sub16, sub16));
            object += 16;
          }
        }
      }
    }
#endif
#endif
    {
      // Remaining complete 16-byte groups.
      while (last - qobject >= 16) {
        __m256i q = _mm256_cvtepu8_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(qobject)));
        for (size_t oi = 0; oi < noOfObjects; oi++) {
          __m256i o = _mm256_cvtepu8_epi16(_mm_loadu_si128(reinterpret_cast<const __m128i*>(object)));
          __m256i sub16 = _mm256_subs_epi16(q, o);
          sum256[oi] = _mm256_add_epi32(sum256[oi], _mm256_madd_epi16(sub16, sub16));
          object += 16;
        }
        qobject += 16;
      }
    }
#ifdef NGTQBG_MIN
    float min = std::numeric_limits<float>::max();
#endif
    const __m256i value0 = _mm256_set1_epi32(0);
    for (size_t oi = 0; oi < noOfObjects; oi++) {
      // Two horizontal adds leave the sum of each 128-bit half in elements 0
      // and 4; adding those gives the total.
      __m256i tmp1 = _mm256_hadd_epi32(sum256[oi], value0);
      __m256i tmp2 = _mm256_hadd_epi32(tmp1, value0);
      distances[oi] = _mm256_extract_epi32(tmp2, 0) + _mm256_extract_epi32(tmp2, 4);
#ifdef NGTQBG_MIN
      // `min` exists only in this configuration, so its use must be guarded too.
      if (distances[oi] < min) {
        min = distances[oi];
      }
#endif
    }
#ifdef NGTQBG_MIN
    return min;
#endif
  }

  // Overload taking a query list. It has no implementation: all arguments are
  // ignored, `distances` is not written, and 0.0 is returned when NGTQBG_MIN
  // is defined.
#ifdef NGTQBG_MIN
  inline float operator()(void *inv, float *distances, size_t noOfObjects, DistanceLookupTableUint8 &distanceLUT, void *query, std::vector<uint32_t> &queryList) {
    return 0.0;
#else
  inline void operator()(void *inv, float *distances, size_t noOfObjects, DistanceLookupTableUint8 &distanceLUT, void *query, std::vector<uint32_t> &queryList) {
#endif
  }

  // Builds the transposed byte stream for all objects of an inverted index
  // entry and returns the pointer obtained from the processing stream.
  // Raises (via NGTThrowException) when the entry's subvector count differs
  // from `localDivisionNo`.
  // NOTE(review): the processing stream object is local to this function;
  // presumably the returned buffer outlives it and the caller owns it -
  // confirm against the base stream class.
  uint8_t *generateRearrangedObjects(NGTQ::InvertedIndexEntry<uint16_t> &invertedIndexObjects) {
    if (invertedIndexObjects.numOfSubvectors != localDivisionNo) {
      std::stringstream msg;
      msg << "Internal fatal error. Invalid # of subvectos. " << invertedIndexObjects.numOfSubvectors << ":" << localDivisionNo;
      NGTThrowException(msg);
    }
    ScalarQuantizedUint8TransposedObjectProcessingStream processingStream(invertedIndexObjects.numOfSubvectors, invertedIndexObjects.size(), quantizer);
    processingStream.arrange(invertedIndexObjects);
    return processingStream.getStream();
  }

  // Not supported for this layout; always raises "not implemented".
  void restoreIntoInvertedIndex(NGTQ::InvertedIndexEntry<uint16_t> &invertedIndexObjects,
                                size_t numOfSubspaces, std::vector<uint32_t> &ids, void *objects) {
    NGTThrowException("not implemented");
  }

  // Returns the stream size in bytes needed for `noOfObjects` objects, as
  // computed by a buffer-less processing stream of dimension `localDivisionNo`.
  size_t getSizeOfCluster(size_t noOfObjects) {
    ScalarQuantizedUint8TransposedObjectProcessingStream processingStream(localDivisionNo);
    return processingStream.getStreamSize(noOfObjects);
  }

};

#endif

Expand Down

0 comments on commit f07d824

Please sign in to comment.