2021년 3월 20일에 업데이트하였습니다.
해싱 알고리즘에는 여러 알고리즘이 있는데 동적 완벽 해싱(dynamic perfect hashing)에 perfect 라는 단어 때문에 궁극의 해싱인가 싶어서 구현해봤습니다.
https://en.wikipedia.org/wiki/Dynamic_perfect_hashing
논문 제목: Dynamic Perfect Hashing: Upper and Lower Bounds
동적 완벽 해싱이 이론 상 좋아보일 수는 있습니다. 하지만, 동적 완벽 해싱은 데이터를 삽입할 때 충돌이 발생하면 랜덤으로 해시 함수를 준비하여 다시 해싱을 하는 과정을 거칩니다. 이 과정에서 시간이 지연됩니다.
논문에서 h(x) 는 다음과 같습니다. k 의 범위는 1 ≤ k ≤ p - 1 입니다.
p 는 prime, s 는 공간 size 입니다.
key 를 주면 그걸 유니버설 해시 함수로 해시값을 만듭니다. mod
연산에서 시간 잡아먹고요.
h(x) = ((k * x) mod p) mod s
그리고…
알고리즘을 보면 insert 할 때 느린 건 이해는 하겠는데 lookup 할 때도 h(x) 를 두번 거치고, 수많은
버킷 배열과 엔트리 배열 때문에 시간이 지연됩니다. 그래서 해싱과는 별개로 배열 수만 늘려보고 테스트를
해봤는데, 단순히 배열 수만 늘어나도 array[i]
의 참조 연산 속도가 떨어지더군요. 이론과 실제는
다르다는 것을 또 다시 느낍니다.
i = h(x, random, PRIME, buckets_len)
entries = buckets[i]
j = h(x, random, PRIME, entries_len)
entry = entries[j]
그러면 이 알고리즘은 쓸모가 없는가? 그렇지 않습니다. 기존의 해시 테이블에서 충돌 해결 기법으로 동적 완벽 해싱을 사용하면 매우 좋을 것 같습니다.
다음 소스코드는 제가 논문 뒤져보면서 작성한 코드입니다.
논문에 몇몇 오류가 있는데 가급적 논문에 나온 그대로 작성하였습니다. 논문과 일부 다른 부분도 있습니다.
컴파일 방법은 다음과 같습니다. 최적화 옵션 -O1
을 넣지 않으면 너무 느리니 최적화 옵션을 넣어서
컴파일해보시기 바랍니다.
cc -O1 -Wall -o tian-dynfect tian-dynfect.c
/*
* tian-dynfect.c
* Copyright (C) 2021 Hodong Kim, All rights reserved.
* Unauthorized copying of this software, via any medium is strictly prohibited.
* Proprietary and confidential.
* Written by Hodong Kim <hodong@nimfsoft.com> January,Febrary 2021
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <assert.h>
#include <time.h>
typedef void (* TianFreeFunc) (void *data);
typedef bool (* TianEqualFunc) (const void *a, const void *b);
typedef struct _TianArray TianArray;
struct _TianArray {
void **data;
unsigned int len;
};
typedef struct _TianArrayPrivate TianArrayPrivate;
struct _TianArrayPrivate {
void **data;
unsigned int len;
TianFreeFunc free_func;
unsigned int capacity;
};
TianArray *tian_array_new (TianFreeFunc free_func)
{
TianArrayPrivate *array;
array = malloc (sizeof (TianArrayPrivate));
array->capacity = 1;
array->data = malloc (array->capacity * sizeof (void *));
array->len = 0;
array->free_func = free_func;
return (TianArray *) array;
}
void **tian_array_free (TianArray *array, bool free_data)
{
TianArrayPrivate *priv = (TianArrayPrivate *) array;
if (free_data)
{
if (priv->free_func)
for (int i = 0; i < array->len; i++)
if (array->data[i])
priv->free_func (array->data[i]);
free (array->data);
free (array);
return NULL;
}
else
{
void **data = array->data;
free (array);
return data;
}
}
void tian_array_clear (TianArray *array)
{
TianArrayPrivate *priv = (TianArrayPrivate *) array;
if (priv->free_func)
for (int i = 0; i < array->len; i++)
if (array->data[i])
priv->free_func (array->data[i]);
priv->capacity = 1;
array->data = realloc (priv->data, priv->capacity * sizeof (void *));
array->len = 0;
}
void tian_array_add (TianArray *array, void *data)
{
TianArrayPrivate *priv = (TianArrayPrivate *) array;
if (array->len == priv->capacity)
{
priv->capacity = priv->capacity * 2;
array->data = realloc (array->data, sizeof (void *) * priv->capacity);
}
array->data[array->len] = data;
array->len++;
}
/*****************************************************************************/
typedef uint64_t (* TianHashFunc) (const void *key);
typedef struct _TianEntry TianEntry;
struct _TianEntry {
void *key;
void *value;
};
TianEntry *tian_entry_new (void *key, void *value)
{
TianEntry *entry = malloc (sizeof (TianEntry));
entry->key = key;
entry->value = value;
return entry;
}
void tian_entry_free (TianEntry *entry,
TianFreeFunc key_free_func,
TianFreeFunc value_free_func)
{
if (key_free_func)
key_free_func (entry->key);
if (value_free_func)
value_free_func (entry->value);
free (entry);
}
/*****************************************************************************/
bool tian_direct_equal (const void *a, const void *b)
{
return a == b;
}
bool tian_str_equal (const void *s1, const void *s2)
{
return strcmp (s1, s2) == 0;
}
/*****************************************************************************/
/* RAND_MAX: 0x7fffffffU This is a prime number */
/* UINT32_MAX: 0xffffffffU */
#if defined(__FreeBSD__) || defined(__linux__)
#define PRIME 0xfffffffbU /* for arc4random_uniform() or getrandom() */
#else
#define PRIME 0x7fffffffU /* for rand() */
#endif
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
typedef struct _TianHashable TianHashable;
struct _TianHashable {
unsigned random;
unsigned len;
};
typedef unsigned (* TianUHashFunc) (TianHashable *hashable,
const void *key);
typedef struct _TianBucket TianBucket;
struct _TianBucket {
unsigned random; /* 1 <= k <= prime - 1 */
unsigned entries_len; /* entries_len = 2 * m * (m - 1) */
unsigned m; /* entries_len = 2 * m * (m - 1) */
unsigned b; /* rehash: b = lists[i]->len, insert: b = b + 1 */
TianEntry **entries;
};
/* constant */
#define DYNFECT_C 1 /* m = (1 + c) * MAX (count, 4) in rehash() */
#define DYNFECT_S 1 /* buckets_len = S(M) = DYNFECT_S * dynfect->m */
typedef struct _TianDynfect TianDynfect;
struct _TianDynfect {
unsigned random; /* 1 <= k <= prime - 1 */
unsigned buckets_len; /* buckets_len = S(M) = DYNFECT_S * dynfect->m */
TianUHashFunc hash_func;
TianEqualFunc key_equal_func;
TianFreeFunc key_free_func;
TianFreeFunc value_free_func;
TianBucket *buckets;
unsigned op_count; /* count++ when add, remove */
unsigned m; /* m = (1 + c) * MAX (count, 4) in rehash() */
unsigned n_items;
};
/* universal hash for integer */
static unsigned tian_hashable_uhash_int (TianHashable *hashable,
const void *key)
{
/* h(x) = (kx mod p) mod s; 1 <= k <= prime - 1 */
/* prime >= hashable->len */
return (((unsigned long) hashable->random * (unsigned long) key) %
(unsigned long) PRIME) % (unsigned long) hashable->len;
}
static unsigned tian_dynfect_gen_random ()
{
#if !defined(__FreeBSD__) && !defined(__linux__)
#if RAND_MAX != 0x7fffffffU
#error "Debug this source code"
#endif
static long seed = -1;
if (seed < 0)
{
/* This is insecure */
seed = time (NULL);
srand (seed);
}
#endif
/* h(x) = (kx mod p) mod s; 1 <= k <= prime - 1 */
#ifdef __FreeBSD__
return arc4random_uniform (PRIME - 1) + 1; /* uint32_t */
#elif __linux__
uint32_t buf;
(void) getrandom (&buf, sizeof (unsigned), 0);
return buf % (PRIME - 1) + 1;
#else
return rand() % (PRIME - 1) + 1; /* int */
#endif
}
static bool tian_dynfect_condition (TianDynfect *dynfect)
{
unsigned sum = 0;
/* 32 * M * M / S(M) + 4 * M */
/* S(M) is buckets_len */
/* buckets_len is DYNFECT_S * dynfect->m */
unsigned bound = 32 * dynfect->m / DYNFECT_S + 4 * dynfect->m;
for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
sum = sum + dynfect->buckets[i].m;
if (sum > bound)
return false;
}
return true;
}
static bool tian_dynfect_inject (TianDynfect *dynfect,
TianArray *list,
TianBucket *bucket)
{
TianEntry *entry;
unsigned index;
bucket->random = tian_dynfect_gen_random ();
for (unsigned i = 0; i < list->len; i++)
{
entry = list->data[i];
index = dynfect->hash_func ((TianHashable *) bucket, entry->key);
if (bucket->entries[index])
return false;
bucket->entries[index] = entry;
}
return true;
}
static bool tian_dynfect_contains (TianDynfect *dynfect,
const void *key)
{
unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
TianBucket *bucket = &dynfect->buckets[i];
if (bucket->entries_len > 0)
{
int pos = dynfect->hash_func ((TianHashable *) bucket, key);
TianEntry *entry = bucket->entries[pos];
if (entry && dynfect->key_equal_func (entry->key, key))
return true;
}
return false;
}
static void tian_dynfect_rehash (TianDynfect *dynfect, TianEntry *new_entry)
{
TianArray *list;
TianArray **lists;
list = tian_array_new (NULL);
for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
TianBucket *bucket = &dynfect->buckets[i];
for (unsigned j = 0; j < bucket->entries_len; j++)
if (bucket->entries[j])
tian_array_add (list, bucket->entries[j]);
free (bucket->entries);
}
if (new_entry)
{
tian_array_add (list, new_entry);
dynfect->n_items++;
}
dynfect->op_count = list->len;
dynfect->m = (1 + DYNFECT_C) * MAX (dynfect->op_count, 4);
dynfect->buckets_len = DYNFECT_S * dynfect->m;
dynfect->buckets = realloc (dynfect->buckets,
dynfect->buckets_len * sizeof (TianBucket));
memset (dynfect->buckets, 0, dynfect->buckets_len * sizeof (TianBucket));
lists = calloc (dynfect->buckets_len, sizeof (TianArray *));
do {
dynfect->random = tian_dynfect_gen_random ();
for (unsigned i = 0; i < dynfect->buckets_len; i++)
if (lists[i] && lists[i]->len)
tian_array_clear (lists[i]);
for (unsigned i = 0; i < list->len; i++)
{
TianEntry *entry = list->data[i];
unsigned pos = dynfect->hash_func ((TianHashable *) dynfect, entry->key);
if (!lists[pos])
lists[pos] = tian_array_new (NULL);
tian_array_add (lists[pos], entry);
}
for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
TianBucket *bucket = &dynfect->buckets[i];
memset (bucket, 0, sizeof (TianBucket));
bucket->b = lists[i] ? lists[i]->len : 0;
bucket->m = 2 * bucket->b;
bucket->entries_len = 2 * bucket->m * (bucket->m - 1);
}
} while (!tian_dynfect_condition (dynfect));
tian_array_free (list, true);
for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
TianBucket *bucket = &dynfect->buckets[i];
if (lists[i] && (lists[i]->len > 0))
{
bucket->entries = calloc (bucket->entries_len, sizeof (TianEntry *));
do {
memset (bucket->entries, 0, bucket->entries_len * sizeof (TianEntry *));
} while (!tian_dynfect_inject (dynfect, lists[i], bucket));
}
if (lists[i])
tian_array_free (lists[i], true);
}
free (lists);
}
static TianDynfect *tian_dynfect_new (TianUHashFunc hash_func,
TianEqualFunc key_equal_func,
TianFreeFunc key_free_func,
TianFreeFunc value_free_func)
{
TianDynfect *dynfect = malloc (sizeof (TianDynfect));
dynfect->hash_func = hash_func;
dynfect->key_equal_func = key_equal_func;
dynfect->key_free_func = key_free_func;
dynfect->value_free_func = value_free_func;
dynfect->op_count = 0;
dynfect->buckets_len = 0;
dynfect->buckets = NULL;
dynfect->n_items = 0;
dynfect->m = 0;
tian_dynfect_rehash (dynfect, NULL);
return dynfect;
}
static bool tian_dynfect_insert (TianDynfect *dynfect,
void *key,
void *value)
{
unsigned index = dynfect->hash_func ((TianHashable *) dynfect, key);
unsigned pos;
TianBucket *bucket = &dynfect->buckets[index];
if (bucket->entries_len > 0)
{
pos = dynfect->hash_func ((TianHashable *) bucket, key);
TianEntry *entry = bucket->entries[pos];
if (entry)
{
if (dynfect->key_equal_func (entry->key, key))
{
if (dynfect->key_free_func)
dynfect->key_free_func (entry->key);
if (dynfect->value_free_func)
dynfect->value_free_func (entry->value);
entry->key = key;
entry->value = value;
return false;
}
}
}
dynfect->op_count++;
if (dynfect->op_count > dynfect->m)
{
tian_dynfect_rehash (dynfect, tian_entry_new (key, value));
return true;
}
bucket->b++;
if (bucket->b <= bucket->m)
{
if (!bucket->entries[pos])
{
bucket->entries[pos] = tian_entry_new (key, value);
}
else
{
TianArray *list;
list = tian_array_new (NULL);
for (unsigned i = 0; i < bucket->entries_len; i++)
if (bucket->entries[i])
tian_array_add (list, bucket->entries[i]);
tian_array_add (list, tian_entry_new (key, value));
do {
if (bucket->entries == NULL)
bucket->entries = calloc (bucket->entries_len, sizeof (TianEntry *));
else
memset (bucket->entries,
0, bucket->entries_len * sizeof (TianEntry *));
} while (!tian_dynfect_inject (dynfect, list, bucket));
tian_array_free (list, true);
}
}
else
{
if (tian_dynfect_condition (dynfect))
{
TianArray *list;
list = tian_array_new (NULL);
if (bucket->entries)
for (unsigned j = 0; j < bucket->entries_len; j++)
if (bucket->entries[j])
tian_array_add (list, bucket->entries[j]);
tian_array_add (list, tian_entry_new (key, value));
bucket->m = 2 * MAX (bucket->m, 1);
bucket->entries_len = 2 * (bucket->m) * (bucket->m - 1);
bucket->entries = realloc (bucket->entries,
bucket->entries_len * sizeof (TianEntry *));
do {
memset (bucket->entries, 0, bucket->entries_len * sizeof (TianEntry *));
} while (!tian_dynfect_inject (dynfect, list, bucket));
tian_array_free (list, true);
}
else
{
tian_dynfect_rehash (dynfect, tian_entry_new (key, value));
}
}
dynfect->n_items++;
return true;
}
static bool tian_dynfect_remove (TianDynfect *dynfect, const void *key)
{
dynfect->op_count++;
unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
TianBucket *bucket = &dynfect->buckets[i];
if (bucket->entries_len < 1)
return false;
int pos = dynfect->hash_func ((TianHashable *) bucket, key);
TianEntry *entry = bucket->entries[pos];
if (entry == NULL)
return false;
tian_entry_free (entry,
dynfect->key_free_func,
dynfect->value_free_func);
bucket->entries[pos] = NULL;
dynfect->n_items--;
if (dynfect->op_count >= dynfect->m)
tian_dynfect_rehash (dynfect, NULL);
return true;
}
static void *tian_dynfect_lookup (TianDynfect *dynfect,
const void *key)
{
unsigned i = dynfect->hash_func ((TianHashable *) dynfect, key);
TianBucket *bucket = &dynfect->buckets[i];
if (bucket->entries_len > 0)
{
int pos = dynfect->hash_func ((TianHashable *) bucket, key);
TianEntry *entry = bucket->entries[pos];
if (entry && dynfect->key_equal_func (entry->key, key))
return entry->value;
}
return NULL;
}
static void tian_dynfect_free (TianDynfect *dynfect)
{
for (unsigned i = 0; i < dynfect->buckets_len; i++)
{
TianBucket *bucket = &dynfect->buckets[i];
for (unsigned j = 0; j < bucket->entries_len; j++)
{
TianEntry *entry = bucket->entries[j];
if (entry)
tian_entry_free (entry,
dynfect->key_free_func,
dynfect->value_free_func);
}
free (bucket->entries);
}
free (dynfect->buckets);
free (dynfect);
}
unsigned tian_dynfect_size (TianDynfect *dynfect)
{
return dynfect->n_items;
}
/*****************************************************************************/
int main (int argc, char **argv)
{
puts ("====== start of dynfect test");
struct timespec ts1, ts2;
TianDynfect *dynfect;
dynfect = tian_dynfect_new (tian_hashable_uhash_int, tian_direct_equal, NULL, NULL);
/* replace test */
clock_gettime (CLOCK_MONOTONIC, &ts1);
for (uint32_t i = 0; i < 1000; i++)
tian_dynfect_insert (dynfect, (void *) (uint64_t) 10,
(void *) (uint64_t) i);
clock_gettime (CLOCK_MONOTONIC, &ts2);
printf ("tian: added 1000 times: %ld ms\n",
((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));
assert (tian_dynfect_size (dynfect) == 1);
for (uint32_t i = 0; i < 1000; i++)
{
uint32_t retval;
retval = (uint32_t) (uint64_t) tian_dynfect_lookup (dynfect, (void *) (uint64_t) 10);
assert (retval == 999);
assert (tian_dynfect_contains (dynfect, (void *) (uint64_t) 10));
}
assert (tian_dynfect_remove (dynfect, (void *) 10));
clock_gettime (CLOCK_MONOTONIC, &ts1);
for (uint32_t i = 0; i < 10000; i++)
tian_dynfect_insert (dynfect, (void *) (uint64_t) i,
(void *) (uint64_t) i);
clock_gettime (CLOCK_MONOTONIC, &ts2);
printf ("tian: added 10000 times: %ld ms\n",
((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));
clock_gettime (CLOCK_MONOTONIC, &ts1);
for (uint32_t i = 0; i < 10000; i++)
{
uint32_t retval;
retval = (uint32_t) (uint64_t) tian_dynfect_lookup (dynfect, (void *) (uint64_t) i);
assert (i == retval);
}
printf ("tian: lookup 10000 times: %ld ms\n",
((ts2.tv_sec * 1000000000 + ts2.tv_nsec) - (ts1.tv_sec * 1000000000 + ts1.tv_nsec)) / 1000000);
printf ("tian_hash_table_size (dynfect): %u\n", tian_dynfect_size (dynfect));
tian_dynfect_free (dynfect);
return 0;
}