libmemcachedにlibhashkitなんていう便利そうなライブラリがついていたので、各種ハッシュアルゴリズムの特性を調べてみた。
http://libmemcached.org/libMemcached.html
libhashkitでつかえるアルゴリズムは以下の通り。(libhashkit/types.h参照)
それぞれのアルゴリズムの説明は検索すればでてくるので省略。
- HASHKIT_HASH_MD5,
- HASHKIT_HASH_CRC,
- HASHKIT_HASH_FNV1_64,
- HASHKIT_HASH_FNV1A_64,
- HASHKIT_HASH_FNV1_32,
- HASHKIT_HASH_FNV1A_32,
- HASHKIT_HASH_HSIEH,
- HASHKIT_HASH_MURMUR,
- HASHKIT_HASH_JENKINS,
テスト方法は、互いに異なる文字列のdigest値をそれぞれ計算して、conflictが発生したかどうか、標準偏差、値の分布のヒストグラムをダンプするだけ。
作成したコードは以下の通り。
#include <boost/lambda/lambda.hpp>
#include <boost/lambda/bind.hpp>
#include <libhashkit/hashkit.h>
#include <iostream>
#include <algorithm>
#include <vector>
#include <set>
#include <map>
#include <stdlib.h>
#include <math.h>
using namespace std;
//#define TEST_RAND
#define TEST_CASES 10000
#define LETTER_NUM 62
static char letters[LETTER_NUM+1] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
static string rand_str(int len, unsigned int seed) {
char buf[len+1];
srand(seed + 1);
for (int i = 0; i < len; i++) {
buf[i] = letters[(int)((double)LETTER_NUM * rand() / (RAND_MAX + 1.0))];
}
buf[len] = '\0';
return string(buf);
}
static set<string> rand_strings;
static void gen_rand_strings(int n)
{
rand_strings.erase(rand_strings.begin(), rand_strings.end());
for (int i = 0; i < n; ++i) {
string s = rand_str(10, i);
if (!rand_strings.insert(s).second) {
cerr << "<" << s << "> is known!" << endl;
++n;
}
}
}
class TestHash {
private:
multiset<uint32_t> results;
Hashkit h;
hashkit_hash_algorithm_t hash_type;
int cases;
void init(void) {
results.erase(results.begin(), results.end());
}
public:
TestHash(int n, hashkit_hash_algorithm_t t) : cases(n), hash_type(t) {
h.set_function(t);
}
void byRand(void) {
init();
for (set<string>::iterator it = rand_strings.begin(); it != rand_strings.end(); ++it) {
string s = *it;
results.insert(h.digest(s));
}
}
void bySeq(void) {
init();
char buf[256];
for (int i = 0; i < cases; ++i) {
snprintf(buf, 256, "%010d", i);
string s(buf);
results.insert(h.digest(s));
}
}
int conflict(void) {
uint32_t last = -1;
int c = 0;
for (multiset<uint32_t>::iterator it = results.begin(); it != results.end(); ++it) {
if (last == *it)
++c;
last = *it;
}
return c;
}
double average(void) {
double sum = 0.0;
for_each(results.begin(), results.end(), sum += boost::lambda::_1);
return sum / cases;
}
double variance() {
double avr = average();
double sum = 0.0;
for_each(results.begin(), results.end(), sum += bind(pow, avr - boost::lambda::_1, 2));
return sum / cases;
}
double deviation(void) {
return sqrt(variance());
}
vector<int> histogram(int n) {
uint32_t max = *max_element(results.begin(), results.end());
vector<int> ret;
ret.assign(n+1, 0);
for (multiset<uint32_t>::iterator it = results.begin(); it != results.end(); ++it) {
ret[(int)((double) *it * n / max)] += 1;
}
ret.pop_back(); // remove max element
return ret;
}
void dump(void) {
cout << "conflict: " << conflict() << endl;
// cout << "average: " << average() << endl;
// cout << "variance: " << variance() << endl;
cout << "deviation: " << (long)deviation() << endl;
vector<int> ret = histogram(10);
cout << "histogram: " << endl;
for_each(ret.begin(), ret.end(), cout << boost::lambda::_1 << "\n");
}
};
void exec_test(pair<hashkit_hash_algorithm_t, string> info)
{
TestHash test(TEST_CASES, info.first);
cout << ">>" << info.second << endl;
#ifdef TEST_RAND
test.byRand();
test.dump();
#else
test.bySeq();
test.dump();
#endif
cout << endl;;
}
int main(void)
{
map<hashkit_hash_algorithm_t, string> types;
types.insert(make_pair(HASHKIT_HASH_MD5, "HASHKIT_HASH_MD5"));
// types.insert(make_pair(HASHKIT_HASH_CRC, "HASHKIT_HASH_CRC"));
types.insert(make_pair(HASHKIT_HASH_FNV1_64, "HASHKIT_HASH_FNV1_64"));
// types.insert(make_pair(HASHKIT_HASH_FNV1A_64, "HASHKIT_HASH_FNV1A_64"));
// types.insert(make_pair(HASHKIT_HASH_FNV1_32, "HASHKIT_HASH_FNV1_32"));
// types.insert(make_pair(HASHKIT_HASH_FNV1A_32, "HASHKIT_HASH_FNV1A_32"));
types.insert(make_pair(HASHKIT_HASH_HSIEH, "HASHKIT_HASH_HSIEH"));
types.insert(make_pair(HASHKIT_HASH_MURMUR, "HASHKIT_HASH_MURMUR"));
types.insert(make_pair(HASHKIT_HASH_JENKINS, "HASHKIT_HASH_JENKIN"));
gen_rand_strings(TEST_CASES);
for_each(types.begin(), types.end(), bind(exec_test, boost::lambda::_1));
return 0;
}
libmemcachedのバージョンは0.4.0、boostは1.34.1その他環境は以下の通り
% c++ -v
Using built-in specs.
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Debian 4.3.2-1.1' --with-bugurl=file:///usr/share/doc/gcc-4.3/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++ --prefix=/usr --enable-shared --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --enable-nls --with-gxx-include-dir=/usr/include/c++/4.3 --program-suffix=-4.3 --enable-clocale=gnu --enable-libstdcxx-debug --enable-objc-gc --enable-mpfr --enable-cld --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.3.2 (Debian 4.3.2-1.1)
% uname -a
Linux chameleon 2.6.26-2-amd64 #1 SMP Thu Feb 11 00:59:32 UTC 2010 x86_64 GNU/Linux
ビルドは以下コマンド
c++ -lhashkit hash.cc
ランダム文字列(10文字)の場合。(データが長くなるので全部のアルゴリズムの結果は載せてません)
>>HASHKIT_HASH_MD5
conflict: 0
deviation: 1238971077
histogram:
1051
1004
1026
1028
1019
981
996
953
949
992
>>HASHKIT_HASH_FNV1_64
conflict: 0
deviation: 1243801177
histogram:
1027
977
1002
1020
995
985
976
988
1033
996
>>HASHKIT_HASH_HSIEH
conflict: 0
deviation: 1235906590
histogram:
945
1017
999
981
1034
1036
1002
954
1005
1026
>>HASHKIT_HASH_MURMUR
conflict: 0
deviation: 1238293295
histogram:
980
964
1008
958
1015
997
1004
1013
1069
991
>>HASHKIT_HASH_JENKIN
conflict: 0
deviation: 1236833209
histogram:
975
941
1028
1019
967
1028
966
1037
1002
1036
予想通りどのアルゴリズムもそれなりにきれいに分散しているように見える。
10万個程度までならほとんど衝突もない。
数字の場合。(e.g. 0000000001, 0000000002, ...)
>>HASHKIT_HASH_MD5
conflict: 0
deviation: 1242254142
histogram:
1046
985
1031
1072
974
1003
968
938
996
986
>>HASHKIT_HASH_FNV1_64
conflict: 0
deviation: 401256730
histogram:
0
0
0
0
0
0
4000
4000
0
1999
>>HASHKIT_HASH_HSIEH
conflict: 0
deviation: 1239810127
histogram:
1031
1029
1014
1032
1058
985
956
934
959
1001
>>HASHKIT_HASH_MURMUR
conflict: 0
deviation: 1238208481
histogram:
999
971
989
989
1005
1034
1000
987
1007
1018
>>HASHKIT_HASH_JENKIN
conflict: 0
deviation: 1223190204
histogram:
933
1014
1023
989
1052
1006
1006
1022
996
958
HASHKIT_HASH_FNV1_64の偏りがひどい。(衝突はしていない)
ほかのものはそれなりの性能を発揮しているように見える。
それぞれのアルゴリズムで衝突しやすいパターンをもっと探してみようと思ったけど疲れたのでまた今度。