resolve issue #171 regarding SIMD

yahoojapan · Oct 31, 2024 · f07d824 · f07d824
1 parent fae388a
commit f07d824
Show file tree

Hide file tree

Showing 2 changed files with 1 addition and 210 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.3.2
+2.3.3
diff --git a/lib/NGT/NGTQ/Quantizer.h b/lib/NGT/NGTQ/Quantizer.h
@@ -1287,73 +1287,6 @@ class ScalarQuantizedInt8ObjectProcessingStream {
   const std::type_info* dataTypeInfo;
 };
 
-class ScalarQuantizedUint8TransposedObjectProcessingStream : public ScalarQuantizedInt8ObjectProcessingStream {
- public:
-  ScalarQuantizedUint8TransposedObjectProcessingStream(size_t dimension, size_t nOfObjects, NGTQ::Quantizer &q):
-   ScalarQuantizedInt8ObjectProcessingStream(q) {
-    initialize(dimension, nOfObjects);
-    stream = new uint8_t[streamSize]();
-  }
-
-  ScalarQuantizedUint8TransposedObjectProcessingStream(size_t dim):
-   ScalarQuantizedInt8ObjectProcessingStream(*reinterpret_cast<NGTQ::Quantizer*>(0)) {
-    initialize(dim, 0);
-    stream = 0;
-  }
-
-  void initialize(size_t dim, size_t nOfObjects) {
-    dimension = dim;
-    numOfObjects = nOfObjects;
-    smallBlockSize = 16;
-    blockSize = 64;
-    lineSize = blockSize * nOfObjects;
-    setStreamSize();
-  }
-
-  void arrangeObject(size_t dataNo, std::vector<float> &object, float scale, float offset, bool shift) {
-    if (dataNo >= numOfObjects) {
-      std::stringstream msg;
-      msg << "The data index is out of the range. " << dataNo << ":" << numOfObjects;
-      NGTThrowException(msg);
-    }
-    NGT::ObjectSpace::quantizeToQint8(object, *dataTypeInfo, dimension, offset, scale, shift);
-    size_t endOfBlock = ((object.size() - 1) / blockSize) * blockSize;
-    for (size_t i = 0; i < endOfBlock; i++) {
-      size_t idx = lineSize * (i / blockSize) + blockSize * dataNo + i % blockSize;
-      if (idx >= streamSize) {
-	std::stringstream msg;
-	msg << "Fatal inner error! (1) " << idx << ":" << streamSize << " " 
-	    << dataNo << ":" << i << "/" << object.size();
-	NGTThrowException(msg);
-      }
-      stream[idx] = static_cast<uint8_t>(object[i]);
-    }
-    for (size_t i = endOfBlock; i < object.size(); i++) {
-      size_t idx = lineSize * (i / blockSize) + (i - endOfBlock) / smallBlockSize * (smallBlockSize * numOfObjects) + smallBlockSize * dataNo + (i - endOfBlock) % smallBlockSize;
-      if (idx >= streamSize) {
-	std::stringstream msg;
-	msg << "Fatal inner error! (2) " << idx << ":" << streamSize << " " 
-	    << dataNo << ":" << i << "/" << object.size();
-	NGTThrowException(msg);
-      }
-      stream[idx] = static_cast<uint8_t>(object[i]);
-    }
-  }
-
-  void setStreamSize() {
-    streamSize = ((dimension - 1) / smallBlockSize + 1) * smallBlockSize * numOfObjects;
-    return;
-  }
-
-  size_t getStreamSize(size_t nOfObjects) {
-    numOfObjects = nOfObjects;
-    setStreamSize();
-    return streamSize;
-  }
-  size_t smallBlockSize;
-  size_t blockSize;
-  size_t lineSize;
-};
 ///// 
 
 class QuantizedObjectDistance {
@@ -3048,148 +2981,6 @@ class ScalarQuantizedInt8ObjectDistance : public NonLocalQuantizedObjectDistance
 
 };
 
-class ScalarQuantizedUint8TransposedObjectDistance : public ScalarQuantizedInt8ObjectDistance<NGT::quint8, NGT::quint8> {
- public:
-  ScalarQuantizedUint8TransposedObjectDistance(Quantizer &q, DistanceType dtype):ScalarQuantizedInt8ObjectDistance<NGT::quint8, NGT::quint8>(q, dtype) {}
-  ~ScalarQuantizedUint8TransposedObjectDistance() {}
-
-#ifdef NGTQBG_MIN
-  inline float operator()(void *inv, float *distances, size_t noOfObjects, DistanceLookupTableUint8 &distanceLUT, void *query) {
-#else
-  inline void operator()(void *inv, float *distances, size_t noOfObjects, DistanceLookupTableUint8 &distanceLUT, void *query) {
-#endif
-    if (query == 0) {
-      NGTThrowException("Fatal inner error! The specified query is invalid..");
-    }
-    const uint8_t *object = static_cast<uint8_t*>(inv);
-    const uint8_t *qobject = static_cast<uint8_t*>(query);
-    const unsigned char *last = qobject + dimension;
-    __m256i sum256[noOfObjects];
-#if defined(NGT_AVX512)
-    {
-      __m512i sum512[noOfObjects];
-      for (size_t oi = 0; oi < noOfObjects; oi++) {
-	sum512[oi] = _mm512_setzero_si512();
-      }
-      const unsigned char *lastgroup = last - 63;
-      while (qobject < lastgroup) {
-	__m512i q = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(qobject));
-	for (size_t oi = 0; oi < noOfObjects; oi++) {
-	  __m512i o = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(object));
-	  __mmask64 m = _mm512_cmplt_epu8_mask(q, o);
-	  __m512i x = _mm512_add_epi8(_mm512_maskz_subs_epu8(m, o, q),
-				      _mm512_maskz_subs_epu8(~m, q, o));
-	  __m512i xi16 = _mm512_cvtepu8_epi16(_mm512_extracti32x8_epi32(x,0));
-	  sum512[oi] = _mm512_add_epi32(sum512[oi], _mm512_madd_epi16(xi16, xi16));
-	  xi16 = _mm512_cvtepu8_epi16(_mm512_extracti32x8_epi32(x,1));
-	  sum512[oi] = _mm512_add_epi32(sum512[oi], _mm512_madd_epi16(xi16, xi16));
-	  object += 64;
-        }
-        qobject += 64;
-      }
-      for (size_t oi = 0; oi < noOfObjects; oi++) {
-	sum256[oi] = _mm256_add_epi32(_mm512_extracti32x8_epi32(sum512[oi], 0),
-				      _mm512_extracti32x8_epi32(sum512[oi], 1));
-      }
-    }
-#elif defined(NGT_AVX2)
-    {
-      for (size_t oi = 0; oi < noOfObjects; oi++) {
-	sum256[oi] = _mm256_setzero_si256();
-      }
-      //__attribute__((aligned(32))) uint16_t iv[16];
-      const unsigned char *lastgroup = last - 63;
-      while (qobject < lastgroup) {
-	//std::cerr << "qobj=" << (size_t)qobject << ":" << (size_t)lastgroup << std::endl;
-	__m256i q[4];
-	q[0] = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)qobject));
-	qobject += 16;
-	q[1] = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)qobject));
-	qobject += 16;
-	q[2] = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)qobject));
-	qobject += 16;
-	q[3] = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)qobject));
-	qobject += 16;
-	for (size_t oi = 0; oi < noOfObjects; oi++) {
-	  __m256i o = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)object));
-	  __m256i sub16 = _mm256_subs_epi16(q[0], o);
-	  sum256[oi] = _mm256_add_epi32(sum256[oi], _mm256_madd_epi16(sub16, sub16));
-	  object += 16;
-	  o = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)object));
-	  sub16 = _mm256_subs_epi16(q[1], o);
-	  sum256[oi] = _mm256_add_epi32(sum256[oi], _mm256_madd_epi16(sub16, sub16));
-	  object += 16;
-	  o = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)object));
-	  sub16 = _mm256_subs_epi16(q[2], o);
-	  sum256[oi] = _mm256_add_epi32(sum256[oi], _mm256_madd_epi16(sub16, sub16));
-	  object += 16;
-	  o = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)object));
-	  sub16 = _mm256_subs_epi16(q[3], o);
-	  sum256[oi] = _mm256_add_epi32(sum256[oi], _mm256_madd_epi16(sub16, sub16));
-	  object += 16;
-	}
-      }
-    }
-#endif
-    {
-      //__attribute__((aligned(32))) uint16_t iv[16];
-      const unsigned char *lastgroup = last - 15;
-      while (qobject < lastgroup) {
-	__m256i q = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)qobject));
-	for (size_t oi = 0; oi < noOfObjects; oi++) {
-	  __m256i o = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const*)object));
-	  __m256i sub16 = _mm256_subs_epi16(q, o);
-	  sum256[oi] = _mm256_add_epi32(sum256[oi], _mm256_madd_epi16(sub16, sub16));
-	  object += 16;
-	}
-	qobject += 16;
-      }
-    }
-#ifdef NGTQBG_MIN
-    float min = std::numeric_limits<float>::max();
-#endif
-    const __m256i value0 = _mm256_set1_epi32(0);
-    for (size_t oi = 0; oi < noOfObjects; oi++) {
-      __m256i tmp1 = _mm256_hadd_epi32(sum256[oi], value0);
-      __m256i tmp2 = _mm256_hadd_epi32(tmp1, value0);
-      distances[oi] = _mm256_extract_epi32(tmp2, 0) + _mm256_extract_epi32(tmp2, 4);
-      if (distances[oi] < min) {
-	min = distances[oi];
-      }
-    }
-#ifdef NGTQBG_MIN
-    return min;
-#endif
-  }
-
-#ifdef NGTQBG_MIN
-  inline float operator()(void *inv, float *distances, size_t noOfObjects, DistanceLookupTableUint8 &distanceLUT, void *query, std::vector<uint32_t> &queryList) {
-    return 0.0;
-#else
-  inline void operator()(void *inv, float *distances, size_t noOfObjects, DistanceLookupTableUint8 &distanceLUT, void *query, std::vector<uint32_t> &queryList) {
-#endif
-  }
-
-  uint8_t *generateRearrangedObjects(NGTQ::InvertedIndexEntry<uint16_t> &invertedIndexObjects) {
-    if (invertedIndexObjects.numOfSubvectors != localDivisionNo) {
-      std::stringstream msg;
-      msg << "Internal fatal error. Invalid # of subvectos. " << invertedIndexObjects.numOfSubvectors << ":" << localDivisionNo;
-      NGTThrowException(msg);
-    }
-    ScalarQuantizedUint8TransposedObjectProcessingStream processingStream(invertedIndexObjects.numOfSubvectors, invertedIndexObjects.size(), quantizer);
-    processingStream.arrange(invertedIndexObjects);
-    return processingStream.getStream();
-  }
-  void restoreIntoInvertedIndex(NGTQ::InvertedIndexEntry<uint16_t> &invertedIndexObjects,
-				size_t numOfSubspaces, std::vector<uint32_t> &ids, void *objects) {
-    NGTThrowException("not implemented");
-  }
-  size_t getSizeOfCluster(size_t noOfObjects) {
-    ScalarQuantizedUint8TransposedObjectProcessingStream processingStream(localDivisionNo);
-    return processingStream.getStreamSize(noOfObjects);;
-  }
-
-};
 
 #endif