高层的数据结构
集合: Set
映射: Map
推荐阅读英文!!!编程要学英文,哈哈
使用其他数据结构提供的API
集合:set
集合中,每个元素只能存在一次
不盛放重复的二分搜索树是非常好的实现“集合的数据结构”
定义的集合接口
Set.java
package setAndMap;
public interface Set<E> {
void add(E e);
void remove(E e);
boolean contains(E e);
int getSize();
boolean isEmpty();
}
典型应用;
客户统计
词汇量统计
辅助工具,文件读取
FileOperation.java
package setAndMap;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Scanner;
import java.util.Locale;
import java.io.File;
import java.io.BufferedInputStream;
import java.io.IOException;
// 文件相关操作
public class FileOperation {
// 读取文件名称为filename中的内容,并将其中包含的所有词语放进words中
public static boolean readFile(String filename, ArrayList<String> words){
if (filename == null || words == null){
System.out.println("filename is null or words is null");
return false;
}
// 文件读取
Scanner scanner;
try {
File file = new File(filename);
if(file.exists()){
// 文件流
FileInputStream fis = new FileInputStream(file);
// 文件缓存,扫描器
scanner = new Scanner(new BufferedInputStream(fis), "UTF-8");
// 处理方式?
scanner.useLocale(Locale.ENGLISH);
}
else {
System.out.println("File is not exists?");
return false;
}
}
catch(IOException ioe){
System.out.println("Cannot open " + filename);
return false;
}
// 简单分词
// 这个分词方式相对简陋, 没有考虑很多文本处理中的特殊问题
// 下面是啥?搞懂了个大概的思想!
if (scanner.hasNextLine()) {
// 分割?构造?成一个字符串 难道是类似于迭代器的东西?
String contents = scanner.useDelimiter("\\A").next();
// 字母的起始位置?
int start = firstCharacterIndex(contents, 0);
for (int i = start + 1; i <= contents.length(); )
// 找到非字母的位置
if (i == contents.length() || !Character.isLetter(contents.charAt(i))) {
// 分割一个单词,并保存在ArrayList中
String word = contents.substring(start, i).toLowerCase();
words.add(word);
// 找下一个字母的开始位置
start = firstCharacterIndex(contents, i);
i = start + 1;
} else
i++;
}
return true;
}
// 寻找字符串s中,从start的位置开始的第一个字母字符的位置
private static int firstCharacterIndex(String s, int start){
for( int i = start ; i < s.length() ; i ++ )
if( Character.isLetter(s.charAt(i)) )
return i;
return s.length();
}
}
基于二分搜索树实现集合
package setAndMap;
public class BSTSet<E extends Comparable<E>> implements Set<E>{
/**基于二分搜索树实现的集合,将集合包装了一下*/
private BST<E> bst;
public BSTSet(){
bst = new BST<>();
}
@Override
public int getSize(){
return bst.size();
}
@Override
public boolean isEmpty(){
return bst.isEmpty();
}
@Override
public void add(E e){
bst.add(e);
}
@Override
public boolean contains(E e){
return bst.contains(e);
}
@Override
public void remove(E e){
bst.remove(e);
}
// 测试方法
public static void main(String[] args) {
System.out.println("Pride and Prejudice");
// 文件要放在工程目录下!
ArrayList<String> words1 = new ArrayList<>();
if(FileOperation.readFile("pride-and-prejudice.txt", words1)) {
System.out.println("Total words: " + words1.size());
BSTSet<String> set1 = new BSTSet<>();
for (String word : words1)
set1.add(word);
System.out.println("Total different words: " + set1.getSize());
} else {
System.out.println("Error");
}
System.out.println();
System.out.println("A Tale of Two Cities");
ArrayList<String> words2 = new ArrayList<>();
if(FileOperation.readFile("a-tale-of-two-cities.txt", words2)){
System.out.println("Total words: " + words2.size());
BSTSet<String> set2 = new BSTSet<>();
for(String word: words2)
set2.add(word);
System.out.println("Total different words: " + set2.getSize());
} else {
System.out.println("Error");
}
}
}
基于链表实现集合
package setAndMap;
import java.util.ArrayList;
public class LinkedListSet<E> implements Set<E> {
private LinkedList<E> list;
public LinkedListSet(){
this.list = new LinkedList<>();
}
@Override
public int getSize(){
return list.getSize();
}
@Override
public boolean isEmpty(){
return list.isEmpty();
}
@Override
public boolean contains(E e){
return list.contains(e);
}
@Override
public void add(E e){
if(!this.list.contains(e))
this.list.addFirst(e);
}
@Override
public void remove(E e){
list.removeElement(e);
}
// 测试方法
public static void main(String[] args) {
System.out.println("Pride and Prejudice");
// 文件要放在工程目录下!
ArrayList<String> words1 = new ArrayList<>();
if(FileOperation.readFile("pride-and-prejudice.txt", words1)) {
System.out.println("Total words: " + words1.size());
LinkedListSet<String> set1 = new LinkedListSet<>();
for (String word : words1)
set1.add(word);
System.out.println("Total different words: " + set1.getSize());
} else {
System.out.println("Error");
}
System.out.println();
System.out.println("A Tale of Two Cities");
ArrayList<String> words2 = new ArrayList<>();
if(FileOperation.readFile("a-tale-of-two-cities.txt", words2)){
System.out.println("Total words: " + words2.size());
LinkedListSet<String> set2 = new LinkedListSet<>();
for(String word: words2)
set2.add(word);
System.out.println("Total different words: " + set2.getSize());
} else {
System.out.println("Error");
}
}
}
两种实现方法复杂度分析与比较
Main.java
package setAndMap;
import java.util.ArrayList;
public class Main {
public static double testSet(Set<String> set, String filemane){
long startTime = System.nanoTime();
System.out.println(filemane);
ArrayList<String> words = new ArrayList<>();
FileOperation.readFile(filemane, words);
System.out.println("Total words: " + words.size());
for (String word : words)
set.add(word);
System.out.println("Total different words: " + set.getSize());
long endTime = System.nanoTime();
return (endTime - startTime) / 1000000000.0;
}
public static void main(String[] args) {
String filename = "pride-and-prejudice.txt";
BSTSet<String> bstSet = new BSTSet<>();
double time1 = testSet(bstSet, filename);
System.out.println("BST Set:" + time1 + "s");
System.out.println();
LinkedListSet<String> llSet = new LinkedListSet<>();
double time2 = testSet(llSet, filename);
System.out.println("LinkedList Set:" + time2 + "s");
}
}
运行结果:
pride-and-prejudice.txt
Total words: 125901
Total different words: 6530
BST Set:0.358992487s
pride-and-prejudice.txt
Total words: 125901
Total different words: 6530
LinkedList Set:3.58517122s
Process finished with exit code 0
很明显基于二分搜索树的实现要优于链表的实现
复杂度分析:
对于:LinkedListSet:
- 增add:每次增加都会判断里面是否包含值,复杂度为O(n);
- 查contains: O(n);
- 删remove: O(n);
对于:BSTSet
这时候考虑的是树的深度,以平均情况满二叉树为例
树的深度用h表示,深度h与个数n的关系满足一个等比数列的和
n = 2^(0) + 2^(1) + ... + 2^(h-1) = 1 * (1 - 2^(h))/(1-2) = 2^(h) - 1
那么h = log2(n+1) ,也就为O(log(n)),对数的底数与线性关系的常数一样,可以忽略
- 增add:O(h); O(log(n))
- 查contains: O(h); O(log(n))
- 删remove: O(h); O(log(n))
但是当二叉树为最坏的情况,比如所有的元素都没有左子树,这时高度与n相同,复杂度也趋近于O(n)
leetCode上面的804题
package setAndMap;
// 红黑树实现的平衡树
import java.util.TreeSet;
class Solution {
public int uniqueMorseRepresentations(String[] words) {
String[] code = {".-","-...","-.-.","-..",".","..-.","--.","....","..",".---","-.-",".-..","--","-.","---",".--.","--.-",".-.","...","-","..-","...-",".--","-..-","-.--","--.."};
TreeSet<String> set = new TreeSet<>();
for(String word: words){
StringBuilder res = new StringBuilder();
for(int i = 0; i < word.length(); i++)
res.append(code[word.charAt(i) - 'a']);
set.add(res.toString());
}
return set.size();
}
}
拓展:
有序集合:元素具有顺序性,<---基于搜索树的实现
无序集合:没有顺序,<---基于哈希表的实现
多重集合:可以容纳同一种的元素