最近在工作中碰到一个需求,场景是我们需要存储很多的用户的url请求,但有很多请求是 http://192.168.2.245:8082/test/testUrl/12,
http://192.168.2.245:8082/test/testUrl/14,
http://192.168.2.245:8082/test/testUrl/18
url的path中最后是随机字符串,就会导致数据表中的数据量太大。需要把 http://192.168.2.245:8082/test/testUrl/xxx 聚合成 http://192.168.2.245:8082/test/testUrl/*。
还有就是有很多一样的请求url,也需要聚合成一个,比如有一万个www.baidu.com的请求,需要聚合成一个。 这就需要统计出以http://192.168.2.245:8082/test/testUrl为前缀的url的数量,还有 www.baidu.com 请求的数量。 想到的方法是通过前缀树来统计:通过/来切分path,把切分出来的各个分段依次存储到前缀树中。前缀树的结构相对来说还比较简单,贴张图如下:
这张图中每个字母代表一条路径(边);而在我的实现里,是把每个path的分段当作一条路径,最终统计出经过不同路径的数量。
代码如下:
/**
 * A single node of the URL prefix tree.
 *
 * <p>The fields are public and mutable because {@code Trie} reads and updates
 * them directly while inserting URLs.
 */
class Node {
    /** Counter incremented by {@code Trie.insert} for every URL routed through this node. */
    public int pass = 0;

    /** Counter incremented by {@code Trie.insert} for every URL whose last segment is this node. */
    public int end = 0;

    /** Child nodes keyed by the segment string that leads to them. */
    public HashMap<String, Node> nexts = new HashMap<>();

    /** Creates a node with zeroed counters and no children. */
    public Node() {
        // All state is set up by the field initialisers above.
    }
}
/**
 * Prefix tree over URLs, used to find URL prefixes that are worth aggregating.
 *
 * <p>Every inserted URL is stored as the chain
 * {@code protocol -> host[:port] -> /segment -> /segment ...}. While inserting,
 * the trie remembers every prefix (written as {@code host[:port]/seg/...},
 * without the protocol) that
 * <ul>
 *   <li>has been passed more than {@code Config.FILTER_NUM_LIMIT} times, or</li>
 *   <li>has more than {@code Config.FILTER_CLILD_LIMIT} distinct children.</li>
 * </ul>
 * {@link #getMatchUrl()} returns those prefixes with redundant entries removed.
 *
 * <p>This class performs no synchronization.
 */
class Trie {
    private final Node root;
    // Prefixes whose node has been passed more than Config.FILTER_NUM_LIMIT times.
    private final Set<String> passMatchUrl;
    // Prefixes whose node has more than Config.FILTER_CLILD_LIMIT child nodes.
    private final Set<String> childMatchUrl;

    public Trie() {
        root = new Node();
        passMatchUrl = new HashSet<>();
        childMatchUrl = new HashSet<>();
    }

    public Node getRoot() {
        return root;
    }

    /**
     * Returns the recorded prefixes with redundant entries removed.
     *
     * <p>An entry is dropped when another, shorter entry is a prefix of it on a
     * {@code /} boundary (e.g. {@code a.com/x} covers {@code a.com/x/1} but not
     * {@code a.com/xy}). The internal sets are not modified by this call.
     *
     * @return a new set holding the most general matching prefixes
     */
    public Set<String> getMatchUrl() {
        // Merge into a local set so that calling this getter has no side effects.
        Set<String> candidates = new HashSet<>(childMatchUrl);
        candidates.addAll(passMatchUrl);

        // Shortest first: anything that could cover a candidate is already in
        // the result by the time the candidate is examined.
        List<String> shortestFirst = candidates.stream()
                .sorted(Comparator.comparingInt(String::length))
                .collect(Collectors.toList());

        Set<String> result = new HashSet<>();
        for (String candidate : shortestFirst) {
            boolean covered = false;
            for (String kept : result) {
                if (isSegmentPrefix(kept, candidate)) {
                    covered = true;
                    break;
                }
            }
            if (!covered) {
                result.add(candidate);
            }
        }
        return result;
    }

    /**
     * Inserts one URL into the trie and updates the recorded prefixes.
     *
     * <p>A single trailing {@code /} is ignored. A URL without a scheme is
     * treated as {@code http://}; a scheme other than http/https is replaced by
     * {@code http}. Null, empty or unparsable URLs, and URLs without a host,
     * are skipped and leave the trie untouched.
     *
     * @param url the request URL to record; may be null or empty
     */
    public void insert(String url) {
        if (!StringUtils.hasLength(url)) {
            return;
        }
        if (url.endsWith("/")) {
            url = url.substring(0, url.length() - 1);
        }
        url = normalizeScheme(url);

        String protocol;
        String host;
        int port;
        String path;
        try {
            URL urlParse = new URL(url);
            protocol = urlParse.getProtocol();
            host = urlParse.getHost();
            port = urlParse.getPort();
            path = urlParse.getPath();
        } catch (java.net.MalformedURLException e) {
            // Skip the URL instead of inserting an empty domain into the trie.
            e.printStackTrace();
            return;
        }
        if (!StringUtils.hasLength(host)) {
            return;
        }

        Node node = root;
        node.pass++;

        // Protocol level.
        node = node.nexts.computeIfAbsent(protocol, key -> new Node());
        node.pass++;

        // Host (and port, when one is given explicitly) level.
        String domain = port != -1 ? host + ":" + port : host;
        node = node.nexts.computeIfAbsent(domain, key -> new Node());
        node.pass++;
        String matchPath = domain;
        if (node.pass > Config.FILTER_NUM_LIMIT) {
            passMatchUrl.add(matchPath);
        }

        // Path levels: /a/222/b/1dddd is stored as /a, /222, /b and /1dddd.
        if (StringUtils.hasLength(path)) {
            String[] pathArr = path.split("/");
            // Index 0 is the empty string in front of the leading "/".
            for (int i = 1; i < pathArr.length; i++) {
                String subPath = "/" + pathArr[i];
                String parentPath = matchPath;
                Node parent = node;
                node = parent.nexts.computeIfAbsent(subPath, key -> new Node());
                // Check the parent AFTER the child was added, so the insert that
                // crosses the limit is the one that records the prefix.
                if (parent.nexts.size() > Config.FILTER_CLILD_LIMIT) {
                    childMatchUrl.add(parentPath);
                }
                matchPath = parentPath + subPath;
                node.pass++;
                if (node.pass > Config.FILTER_NUM_LIMIT) {
                    // The more specific prefix replaces its parent.
                    passMatchUrl.add(matchPath);
                    passMatchUrl.remove(parentPath);
                }
            }
        }
        // All segments consumed: the URL ends at this node.
        node.end++;
    }

    /** Returns {@code url} with an http/https scheme, adding or replacing the scheme if needed. */
    private static String normalizeScheme(String url) {
        int schemeEnd = url.indexOf("://");
        boolean hasScheme = schemeEnd > 0;
        // A "://" further to the right (e.g. inside a query string) is not a scheme
        // separator; a scheme consists of letters, digits, '+', '-' and '.' only.
        for (int i = 0; hasScheme && i < schemeEnd; i++) {
            char c = url.charAt(i);
            hasScheme = Character.isLetterOrDigit(c) || c == '+' || c == '-' || c == '.';
        }
        if (!hasScheme) {
            return "http://" + url;
        }
        String scheme = url.substring(0, schemeEnd);
        if (scheme.equalsIgnoreCase("http") || scheme.equalsIgnoreCase("https")) {
            return url;
        }
        // NOTE(review): other schemes are rewritten to http, presumably so that
        // java.net.URL can parse them — confirm this is the intended grouping.
        return "http://" + url.substring(schemeEnd + 3);
    }

    /** Tells whether {@code prefix} is a proper prefix of {@code value} ending on a '/' boundary. */
    private static boolean isSegmentPrefix(String prefix, String value) {
        return value.length() > prefix.length()
                && value.charAt(prefix.length()) == '/'
                && value.startsWith(prefix);
    }
}