当前位置：首页 > news >正文

学习Trie树，处理“海量”数据

news 来源：原创 2024/5/18 13:26:55

目前我所了解的海量数据处理的方法包括：哈希，树，归并排序，bitmap等，之前发过测试哈希表的随笔，这一篇则是针对第二种方法：树。

我写了一个Trie树的简单实现，以摘自某网站上的超过100万条的URL为测试数据，算是个小型的海量数据了，检索其中一条记录所消耗的时间为0毫秒，实际是在1毫秒之内，也就是很快的意思。

清空这个树需要DFS或者BFS，我暂时没有实现它，也就是说，下面的代码，没有主动完成释放内存空间的功能。

欢迎有共同学习兴趣的同学和我沟通哦 ^_^

上代码：

  1 #include <stdlib.h>
  2 #include <string>
  3 #include <fstream>
  4 #include <sys/timeb.h>
  5 /**
  6 * @author:zanzan101
  7 */
  8 
  9 using namespace std;
 10 
 11 typedef struct Node
 12 {
 13 
 14     // 有三种以上的实现方式，包括静态数据，链表，左孩子右兄弟，双数组等等
 15     // 静态数据省时，费空间
 16     // 链表节省空间，但费时
 17     // 左孩子右兄弟，费时
 18     // 双数组实现麻烦
 19     
 20     // 这里是用的最简单的静态数组的方法
 21     struct Node* child[256];
 22     int value;
 23     Node():value(0)
 24     {
 25         memset(child, 0, 256*sizeof(void*));
 26     };
 27     Node(int v):value(v)
 28     {
 29         memset(child, 0, 256*sizeof(void*));
 30     };
 31 }* TrieTree;
 32 
 33 void insert_data(TrieTree tree, const char* str)
 34 {
 35     if(!tree)
 36         return;
 37     int len = strlen(str);
 38     int idx = 0;
 39     Node* p = tree;
 40     while(idx < len-1)
 41     {
 42         if(p->child[str[idx]])
 43         {
 44             p = p->child[str[idx]];
 45         }else
 46         {
 47             p->child[str[idx]] = new Node();
 48             p = p->child[str[idx]];
 49         }
 50         idx ++;
 51     }
 52     if(p->child[str[idx]])
 53         p->child[str[idx]]->value += 1;
 54     else
 55         p->child[str[idx]] = new Node(1);
 56 }
 57 
 58 int find_data(TrieTree tree, const char* str)
 59 {
 60     if(!tree)
 61         return 0;
 62     int len = strlen(str);
 63     int idx = 0;
 64     Node* p = tree;
 65     while(idx < len)
 66     {
 67         if(p->child[str[idx]])
 68         {
 69             p = p->child[str[idx]];
 70         }else
 71         {
 72             printf("没有：%s\n", str);
 73             return 0;
 74         }
 75         idx ++;
 76     }
 77     if(p->value)
 78         printf("找到：%s\n", str);
 79     else
 80         printf("没有：%s\n", str);
 81     return p->value;
 82 }
 83 
 84 void delete_data(TrieTree tree, const char* str)
 85 {
 86 
 87     if(!tree)
 88         return;
 89     int len = strlen(str);
 90     int idx = 0;
 91     Node* p = tree;
 92     while(idx < len)
 93     {
 94         if(p->child[str[idx]])
 95         {
 96             p = p->child[str[idx]];
 97         }else
 98             return;
 99         idx ++;
100     }
101     if(p->value)
102         p->value = 0;        // 其实这里可以直接变成0，也可以减1，根据需求而定，即删除一条重复存在的记录时，应该全部删除还是使其数目减一
103     else
104         p->value = 0;
105     // 理论上，这里应该检查他的孩子是不是全空，如果是全部value为0，则应该释放孩子的空间。
106     // 这个功能用DFS或BFS都可以，我这里暂时还没有实现
107 }
108 
109 int main()
110 {
111     timeb time_begin;
112     timeb time_end;
113     unsigned int seconds;
114     unsigned int miseconds;
115     ftime(&time_begin);    
116     ftime(&time_end);
117     seconds = time_end.time - time_begin.time;
118     miseconds = time_end.millitm - time_begin.millitm;
119     miseconds = seconds * 1000 + miseconds;
120 
121     TrieTree tree = new Node();
122     
123 
124     // 测试插入功能
125     printf(">> 测试插入功能：\n");
126     ifstream input_file;
127     input_file.open("D:\\url_test.txt");
128     if(!input_file.is_open())
129     {
130         printf(">> 打开文件错误！\n");
131         system("pause");
132         return 0;
133     }
134     char* buff = new char[256];                    // 这里限定了字符串长度不应超过255字节
135     int count = 0;
136     while(input_file.getline(buff, 255) && ++count)
137         insert_data(tree, buff);
138     printf("插入了%d条记录\n", count);
139     input_file.close();
140     
141     // 测试查找功能
142     printf(">> 测试查找功能：\n");
143     const char* url_a = "http://www.hello.com";                                // 这个在文件中存在
144     const char* url_b = "http://www.hello.com/page/act.php";                // 这个在文件中不存在
145     const char* url_c = "http://www.hello.com/page/act.php?id=1&type=2";    // 这个在文件中存在
146     find_data(tree, url_a);
147     find_data(tree, url_b);
148     find_data(tree, url_c);
149 
150     // 测试查询时间
151     printf(">> 测试在%d条记录中查询一条记录的时间\n", count);
152     const char* url_test_speed = "http://s.click.taobao.com/t?e=zGU34CA7K%2BPkqB05%2Bm7rfGGjlY60oHcc7bkKOQiQwA%2Bn8dIL0yz4XEf7SJJqAW9QTXy4suwjS35MBTv2weLfJpv1G1NPH2kOWiZdmW9hl%2FCQQEpYwkL6cw%3D%3D";
153 
154     ftime(&time_begin);    
155     find_data(tree, url_test_speed);
156     ftime(&time_end);
157     seconds = time_end.time - time_begin.time;
158     miseconds = time_end.millitm - time_begin.millitm;
159     miseconds = seconds * 1000 + miseconds;
160     printf("本次查找耗时：\t%u\t毫秒\n", miseconds);
161 
162     // 测试删除功能
163     printf(">> 测试删除功能：\n");
164     printf("删除：%s\n", url_a);
165     delete_data(tree, url_a);
166     printf(">> 再查找：\n");
167     find_data(tree, url_a);
168     printf("\n删除：%s\n", url_b);
169     delete_data(tree, url_b);
170     printf(">> 再查找：\n");
171     find_data(tree, url_b);
172     printf("\n删除：%s\n", url_c);
173     delete_data(tree, url_c);
174     printf(">> 再查找：\n");
175     find_data(tree, url_c);
176     printf("\n删除：%s\n", url_test_speed);
177     delete_data(tree, url_test_speed);
178     printf(">> 再查找：\n");
179     find_data(tree, url_test_speed);
180     system("pause");
181     return 0;
182 }

输出结果：

>> 测试插入功能：
插入了1015746条记录
>> 测试查找功能：
找到：http://www.hello.com
没有：http://www.hello.com/page/act.php
找到：http://www.hello.com/page/act.php?id=1&type=2
>> 测试在1015746条记录中查询一条记录的时间
找到：http://s.click.taobao.com/t?e=zGU34CA7K%2BPkqB05%2Bm7rfGGjlY60oHcc7bkKOQiQwA%2Bn8dIL0yz4XEf7SJJqAW9QTXy4suwjS35MBTv2weLfJpv1G1NPH2kOWiZdmW9hl%2FCQQEpYwkL6cw%3D%3D
本次查找耗时：  0       毫秒
>> 测试删除功能：
删除：http://www.hello.com
>> 再查找：
没有：http://www.hello.com

删除：http://www.hello.com/page/act.php
>> 再查找：
没有：http://www.hello.com/page/act.php

删除：http://www.hello.com/page/act.php?id=1&type=2
>> 再查找：
没有：http://www.hello.com/page/act.php?id=1&type=2

删除：http://s.click.taobao.com/t?e=zGU34CA7K%2BPkqB05%2Bm7rfGGjlY60oHcc7bkKOQiQwA%2Bn8dIL0yz4XEf7SJJqAW9QTXy4suwjS35MBTv2weLfJpv1G1NPH2kOWiZdmW9hl%2FCQQEpYwkL6cw%3D%3D
>> 再查找：
没有：http://s.click.taobao.com/t?e=zGU34CA7K%2BPkqB05%2Bm7rfGGjlY60oHcc7bkKOQiQwA%2Bn8dIL0yz4XEf7SJJqAW9QTXy4suwjS35MBTv2weLfJpv1G1NPH2kOWiZdmW9hl%2FCQQEpYwkL6cw%3D%3D
请按任意键继续. . .