planes_l = [np.random.normal(size=(N_DIMS, N_PLANES))
for _ in range(N_UNIVERSES)]
def hash_value_of_vector(v, planes):
"""Create a hash for a vector; hash_id says which random hash to use.
Input:
- v: vector of tweet. It's dimension is (1, N_DIMS)
- planes: matrix of dimension (N_DIMS, N_PLANES) - the set of planes that divide up the region
Output:
- res: a number which is used as a hash for your vector
"""
# for the set of planes,
# calculate the dot product between the vector and the matrix containing the planes
# remember that planes has shape (300, 10)
# The dot product will have the shape (1,10)
dot_product = np.dot(v, planes)
# get the sign of the dot product (1,10) shaped vector
sign_of_dot_product = np.sign(dot_product)
# set h to be false (eqivalent to 0 when used in operations) if the sign is negative,
# and true (equivalent to 1) if the sign is positive (1,10) shaped vector
#h = np.where(sign_of_dot_product<=0, 0, sign_of_dot_product)
h = sign_of_dot_product>=0
# remove extra un-used dimensions (convert this from a 2D to a 1D array)
h = np.squeeze(h)
# initialize the hash value to 0
hash_value = 0
n_planes = planes.shape[1]
for i in range(n_planes):
# increment the hash value by 2^i * h_i
hash_value += np.power(2,i)*h[i]
# cast hash_value as an integer
hash_value = int(hash_value)
return hash_value
def make_hash_table(vecs, planes):
"""
Input:
- vecs: list of vectors to be hashed.
- planes: the matrix of planes in a single "universe", with shape (embedding dimensions, number of planes).
Output:
- hash_table: dictionary - keys are hashes, values are lists of vectors (hash buckets)
- id_table: dictionary - keys are hashes, values are list of vectors id's
(it's used to know which tweet corresponds to the hashed vector)
"""
# number of planes is the number of columns in the planes matrix
num_of_planes = planes.shape[1]
# number of buckets is 2^(number of planes)
num_buckets = 2**num_of_planes
# create the hash table as a dictionary.
# Keys are integers (0,1,2.. number of buckets)
# Values are empty lists
hash_table = {i:[] for i in range(num_buckets)}
# create the id table as a dictionary.
# Keys are integers (0,1,2... number of buckets)
# Values are empty lists
id_table = {i:[] for i in range(num_buckets)}
# for each vector in 'vecs'
for i, v in enumerate(vecs):
# calculate the hash value for the vector
h = hash_value_of_vector(v, planes)
# store the vector into hash_table at key h,
# by appending the vector v to the list at key h
hash_table[h].append(v)
# store the vector's index 'i' (each document is given a unique integer 0,1,2...)
# the key is the h, and the 'i' is appended to the list at key h
id_table[h].append(i)
return hash_table, id_table
hash_tables = []
id_tables = []
for universe_id in range(N_UNIVERSES): # there are 25 hashes
print('working on hash universe #:', universe_id)
planes = planes_l[universe_id]
hash_table, id_table = make_hash_table(document_vecs, planes)
hash_tables.append(hash_table)
id_tables.append(id_table)