这是我参与「第三届青训营 -后端场」笔记创作活动的第4篇笔记
k-匿名介绍
关键概念
用户数据类型的分类:
- 显式标识符(ID):一般是个体的唯一标识,比如姓名、地址、电话等等,这些内容需要在公开数据的时候删掉。
- 准标识符(Quasi-identifier,QI):能够以较高的概率结合一定的外部信息确定一条用户记录。类似邮编、年龄、生日、性别等不是唯一的,但是能帮助研究人员关联相关数据的标识。
- 敏感属性:需要保护的信息。
- 非敏感属性:一般可以直接发布的信息。
k-匿名
通过概括(对数据进行更加概括、抽象的描述)和隐匿(不发布某些数据项)技术,发布精度较低的数据,使得每条记录至少与数据表中其他 k-1 条记录具有完全相同的准标识符属性值,从而减少链接攻击所导致的隐私泄露。也就是说,公开数据中任意一种准标识符(QI)取值组合都需要出现至少 k 次。
2-匿名举例
k-匿名所能达到效果:
- 攻击者无法知道某个人是否在公开的数据中
- 给定一个人,攻击者无法确认他是否有某项敏感属性
- 攻击者无法确认某条数据对应的是哪个人
k-匿名不足
- 未排序匹配攻击: 当公开的数据记录和原始记录的顺序一样的时候,攻击者可以猜出匿名化的记录是属于谁。
- 补充数据攻击: 假如公开的数据有多种类型,如果它们k-匿名的方法不同,那么攻击者可以通过关联多种数据推测用户信息。
- 同质化攻击:某个k-匿名组内对应的敏感属性的值也完全相同,这使得攻击者可以轻易获取想要的信息
k-匿名实现
def readdata(filepath='../data', filename='users.data'):
    """Load a comma-separated records file into a list of rows.

    Each non-blank line is split on ',' and every field is stripped.
    The column named 'age' in ATTNAME is converted to int; a value that is
    not a valid integer is reported and replaced by -1.  Any field that is
    empty or '?' is replaced by '*'.

    Args:
        filepath: Directory containing the data file.
        filename: Name of the data file inside ``filepath``.

    Returns:
        A list of rows (each a list whose age column is an int and whose
        other columns are strings), or None when the file cannot be opened
        or read.
    """
    age_idx = ATTNAME.index('age')
    records = []
    try:
        with open(os.path.join(filepath, filename), 'r') as rf:
            for line in rf:
                line = line.strip()
                if not line:
                    continue
                fields = [a.strip() for a in line.split(',')]
                if len(fields) <= age_idx:
                    # A row without an age column used to raise inside the
                    # error handler and abort the whole read with a
                    # misleading "cannot open file" message.
                    print('skipping malformed record (no %s column): %s'
                          % (ATTNAME[age_idx], line))
                    continue
                try:
                    fields[age_idx] = int(fields[age_idx])
                except ValueError:
                    print('attribute %s, value %s, cannot be converted to number' % (ATTNAME[age_idx], fields[age_idx]))
                    fields[age_idx] = -1
                # Normalise missing-value markers; the int age never matches.
                records.append(['*' if f == '' or f == '?' else f for f in fields])
    except OSError:
        # Only genuine I/O failures are reported as an unreadable file.
        print('cannot open file: %s:%s' % (filepath, filename))
        return None
    return records
```
def generate_hierarchy_for_age(records):
    """Write the age generalization hierarchy to AGECONFFILE.

    The largest and smallest ages in ``records`` (ignoring the -1 marker
    used for unparseable ages) are printed, then one line is written for
    every integer age from 0 up to the largest age.  Each line lists the
    age followed by the 25-, 50- and 100-wide ranges that contain it,
    separated by commas.

    Args:
        records: Rows whose 'age' column (located through ATTNAME) holds an
            int, with -1 meaning "unknown".
    """
    age_col = ATTNAME.index('age')
    known_ages = [row[age_col] for row in records if row[age_col] != -1]
    oldest = max(known_ages, default=-float('inf'))
    youngest = min(known_ages, default=float('inf'))
    print('age max: %d min: %d' % (oldest, youngest))

    with open(AGECONFFILE, 'w') as wf:
        for age in range(oldest + 1):
            levels = [str(age)]
            # One bucket per hierarchy level, from narrowest to widest.
            for width in (25, 50, 100):
                low = age // width * width
                levels.append('%s-%s' % (low, low + width))
            wf.write(','.join(levels) + '\n')
```
def generate_hierarchy_for_postcode(records):
    """Write the postcode generalization hierarchy to POSTCONFFILE.

    For every distinct postcode in ``records`` (the '*' missing-value
    marker is ignored) one comma-separated line is written: the postcode
    itself, then the same string with its last 1, 2, ... characters
    replaced by '*', down to a single leading character.

    Postcodes are written in sorted order.  Iterating the set directly
    gives an order that can differ between interpreter runs because of
    string hash randomisation, and the first line of the file decides
    ``Tree.level`` and ``Tree.highestgen``.

    Args:
        records: Rows whose 'postcode' column (located through ATTNAME)
            holds a string.
    """
    postcodeidx = ATTNAME.index('postcode')
    postcodes = {
        record[postcodeidx]
        for record in records
        if record[postcodeidx] != "*"
    }
    with open(POSTCONFFILE, 'w') as wf:
        for postcode in sorted(postcodes):
            length = len(postcode)
            # Mask progressively more trailing characters at each level.
            chain = [postcode[0:length - i] + '*' * i for i in range(length)]
            wf.write(','.join(chain) + '\n')
```
```
```
```
class KAnonymity():
    """Generalize and suppress quasi-identifiers until records are k-anonymous.

    The quasi-identifier (QI) columns of each record are replaced by
    progressively coarser values taken from per-attribute hierarchy files
    (loaded with ``Tree``).  Records whose final QI combination is still
    shared by fewer than ``k`` records are suppressed (written as blank
    lines).
    """

    def __init__(self, records):
        """Store the records and the hierarchy files for sex, age, postcode.

        Args:
            records: List of rows; column positions are given by ATTNAME.
        """
        self.records = records
        # One hierarchy file per QI attribute, in the default qi_names order.
        self.confile = [SEXCONFFILE, AGECONFFILE, POSTCONFFILE]

    def anonymize(self, qi_names=None, k=5):
        """Anonymize ``self.records`` and write them to a data file.

        While more than ``k`` records belong to QI combinations shared by
        fewer than ``k`` records, the attribute with the most distinct
        values that can still be generalized is moved one level up its
        hierarchy.  The result is written to
        ``../data/users_<k>_kanonymity.data`` in the original record order;
        suppressed records become empty lines.  Precision and distortion
        are printed.

        Args:
            qi_names: Names of the QI attributes, matching ``self.confile``
                one-to-one.  Defaults to ['sex', 'age', 'postcode'].
            k: Minimum number of records per QI combination.

        Raises:
            AssertionError: if the number of QI names differs from the
                number of hierarchy files.
            KeyError: if a QI value is absent from its hierarchy.
        """
        if qi_names is None:
            qi_names = ['sex', 'age', 'postcode']
        assert len(self.confile) == len(qi_names), 'number of config files not equal to number of QI-names'

        generalize_tree = dict()
        for idx, name in enumerate(qi_names):
            generalize_tree[name] = Tree(self.confile[idx])
        # value -> (parent, level) mapping per attribute
        hierarchies = {name: generalize_tree[name].root for name in qi_names}

        domains = {name: set() for name in qi_names}
        gen_levels = {name: 0 for name in qi_names}
        qi_frequency = {}  # QI value tuple -> set of record indexes
        for idx, record in enumerate(self.records):
            qi_sequence = self._get_qi_values(record[:], qi_names, generalize_tree)
            qi_frequency.setdefault(qi_sequence, set()).add(idx)
            for j, value in enumerate(qi_sequence):
                domains[qi_names[j]].add(value)

        # iteratively generalize the attribute with maximum distinct values
        while True:
            # count number of records not satisfying k-anonymity
            negcount = sum(
                len(idxset) for idxset in qi_frequency.values() if len(idxset) < k
            )
            if negcount <= k:
                break
            generalize_att = self._pick_attribute(qi_names, domains, hierarchies)
            if generalize_att is None:
                # Every value is already at the top of its hierarchy; the
                # remaining violating records can only be suppressed.
                break
            domains[generalize_att] = self._generalize_sequences(
                qi_frequency,
                qi_names.index(generalize_att),
                hierarchies[generalize_att],
            )
            gen_levels[generalize_att] += 1

        # suppress sequences not satisfying k-anonymity, save the result and
        # compute precision / distortion
        genlvl_att = [0] * len(qi_names)
        dgh_att = [generalize_tree[name].level for name in qi_names]
        qiindex = [ATTNAME.index(name) for name in qi_names]
        # keeps the output file in the same order as the input records
        towriterecords = [None] * len(self.records)
        with open('../data/users_%d_kanonymity.data' % k, 'w') as wf:
            for qi_sequence, recordidxs in qi_frequency.items():
                if len(recordidxs) < k:
                    continue
                for idx in recordidxs:
                    record = self.records[idx][:]
                    for i, column in enumerate(qiindex):
                        record[column] = qi_sequence[i]
                        genlvl_att[i] += hierarchies[qi_names[i]][qi_sequence[i]][1]
                    record = list(map(str, record))
                    for i, value in enumerate(record):
                        # non-QI missing values go back to the '?' marker
                        if value == '*' and i not in qiindex:
                            record[i] = '?'
                    towriterecords[idx] = record
            for record in towriterecords:
                if record is not None:
                    wf.write(', '.join(record))
                wf.write('\n')

        print('qi names: ', qi_names)
        precision = self.calc_precision(genlvl_att, dgh_att, len(self.records), len(qi_names))
        distoration = self.calc_distoration([gen_levels[name] for name in qi_names], dgh_att, len(qi_names))
        print('precision: {}, distoration: {}'.format(precision, distoration))

    @staticmethod
    def _pick_attribute(qi_names, domains, hierarchies):
        """Return the generalizable attribute with the most distinct values.

        An attribute is generalizable when at least one of its current
        values has a parent that is neither None nor the value itself.
        Ties go to the attribute that comes first in ``qi_names``.  Returns
        None when no attribute can be generalized any further.
        """
        best_name, best_size = None, -1
        for name in qi_names:
            hierarchy = hierarchies[name]
            can_climb = any(
                hierarchy[value][0] not in (None, value)
                for value in domains[name]
            )
            if can_climb and len(domains[name]) > best_size:
                best_name, best_size = name, len(domains[name])
        return best_name

    @staticmethod
    def _generalize_sequences(qi_frequency, qi_index, hierarchy):
        """Move position ``qi_index`` of every QI tuple one level up.

        ``qi_frequency`` is updated in place: tuples that become equal have
        their record-index sets merged.  A value with no parent (or that is
        its own parent) is left unchanged instead of being replaced by
        None, which previously led to a KeyError or to dropped records.

        Returns:
            The set of values now present at position ``qi_index``.
        """
        new_domain = set()
        for qi_sequence in list(qi_frequency):
            value = qi_sequence[qi_index]
            parent = hierarchy[value][0]
            if parent is None or parent == value:
                new_domain.add(value)
                continue
            new_qi_sequence = (
                qi_sequence[:qi_index] + (parent,) + qi_sequence[qi_index + 1:]
            )
            moved = qi_frequency.pop(qi_sequence)
            if new_qi_sequence in qi_frequency:
                qi_frequency[new_qi_sequence].update(moved)
            else:
                qi_frequency[new_qi_sequence] = moved
            new_domain.add(parent)
        return new_domain

    def calc_precision(self, genlvl_att, dgh_att, datasize, attsize = 4):
        """Return 1 minus the mean ratio of applied level to hierarchy height.

        Args:
            genlvl_att: Per attribute, the sum of generalization levels over
                all written records.
            dgh_att: Per attribute, the hierarchy height.
            datasize: Number of records used as the denominator.
            attsize: Number of attributes to include.
        """
        return 1 - sum([genlvl_att[i] / dgh_att[i] for i in range(attsize)])/(datasize*attsize)

    def calc_distoration(self, gen_levels_att, dgh_att, attsize):
        """Print the inputs and return the mean of level / hierarchy height.

        Args:
            gen_levels_att: Per attribute, the number of generalization
                rounds applied.
            dgh_att: Per attribute, the hierarchy height.
            attsize: Number of attributes to include.
        """
        print('attribute gen level:', gen_levels_att)
        print('tree height:', dgh_att)
        return sum([gen_levels_att[i] / dgh_att[i] for i in range(attsize)]) / attsize

    def _get_qi_values(self, record, qi_names, generalize_tree):
        """Return the initial QI value tuple of ``record``.

        The age is rendered as a string, with the -1 marker replaced by the
        hard-coded range '0-100'.  For every other attribute the '*' marker
        is replaced by the ``highestgen`` value of that attribute's tree.
        The tree is looked up by attribute name; the previous code indexed
        ``qi_names`` with the ATTNAME column index.  ``record`` is not
        modified.
        """
        seq = []
        for name in qi_names:
            value = record[ATTNAME.index(name)]
            if name == 'age':
                seq.append('0-100' if value == -1 else str(value))
            else:
                if value == '*':
                    # TODO, handle missing value cases
                    value = generalize_tree[name].highestgen
                seq.append(value)
        return tuple(seq)
class Tree:
    """Generalization hierarchy loaded from a comma-separated config file.

    Every non-blank line lists one value followed by its successively more
    general forms.  After loading:

    * ``root`` maps each value to ``(parent, level)``, where ``parent`` is
      the next more general value on that line (None for the last column)
      and ``level`` is the value's column position on that line;
    * ``level`` is the last column position of the first non-blank line
      (-1 if the file has none);
    * ``highestgen`` is the last column of the first line whose last column
      is non-empty ('' if there is none).
    """

    def __init__(self, confile):
        """Remember the config path and load the hierarchy from it."""
        self.confile = confile
        self.root = {}
        self.level = -1
        self.highestgen = ''
        self.buildTree()

    def buildTree(self):
        """Parse ``self.confile`` and fill ``root``, ``level``, ``highestgen``."""
        with open(self.confile, 'r') as handle:
            for raw in handle:
                stripped = raw.strip()
                if stripped:
                    chain = [cell.strip() for cell in stripped.split(',')]
                    top = len(chain) - 1
                    if self.level == -1:
                        self.level = top
                    if not self.highestgen:
                        self.highestgen = chain[-1]
                    # Walk from the most general column back to the leaf so
                    # each value can point at the one visited just before.
                    parent = None
                    for depth in range(top, -1, -1):
                        node = chain[depth]
                        self.root[node] = (parent, depth)
                        parent = node
if __name__ == "__main__":
    # Script entry point: load the default data file and anonymize it
    # with k = 2 using the default quasi-identifiers.
    records = readdata()
    KAnony = KAnonymity(records)
    KAnony.anonymize(k=2)
```
```