Unicode
www.unicode.org/Public/UNIDATA/Blocks.txt
UTF-8 编码规则
Unicode 和 UTF-8 有什么区别? - 知乎 (zhihu.com)
代码
package com.vege;
import org.apache.tomcat.util.buf.HexUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
/**
 * Small demo that encodes Unicode code points into UTF-8 bytes by hand and prints
 * the resulting characters.
 *
 * <p>{@link #main(String[])} prints the character for U+4DC0 and then every code
 * point in the inclusive range U+4DC0..U+4DFF, sixteen per row.
 */
public class UnicodeToUtf8 {

    /** Largest code point accepted by {@link #unicodeToUtf8Str(int)}. */
    private static final int MAX_CODE_POINT = 0x10FFFF;

    /** Mask selecting the six payload bits carried by a UTF-8 continuation byte. */
    private static final int SIX_BITS = 0b111111;

    /** Marker bits ({@code 10xxxxxx}) of a UTF-8 continuation byte. */
    private static final int CONTINUATION = 0b10000000;

    /**
     * Prints the character for U+4DC0, then all characters from {@code from} to
     * {@code to} (both inclusive), sixteen per row, each row preceded by a label.
     *
     * @param args ignored
     * @throws IOException never thrown by the current body; kept so the signature
     *                     stays unchanged
     */
    public static void main(String[] args) throws IOException {
        // Print the UTF-8 encoded character for the Unicode code point 0x4DC0.
        String temp = unicodeToUtf8Str(hexStrToInt("4DC0"));
        System.out.println("0x4DC0 - " + temp);
        System.out.println();

        // ======================================================
        // Print every code point from `from` to `to`, both ends included,
        // sixteen characters per row.
        int from = 0x4DC0;
        int to = 0x4DFF;
        for (int i = from, j = 0; i <= to; i++, j++) {
            if (j % 16 == 0) {
                System.out.println();
                String start = toHexLabel(i);
                // The end label only swaps the last hex digit for 'f', so it is
                // accurate when `from` is a multiple of 16.
                String end = start.substring(0, start.length() - 1) + "f";
                System.out.println("-> " + start + " - " + end + " : ");
            }
            System.out.print("\t");
            System.out.print(unicodeToUtf8Str(i));
        }
        System.out.println();
    }

    /**
     * Parses a hexadecimal number given as a string into an {@code int}.
     * Both upper-case and lower-case digits are accepted, e.g. {@code "4DC0"} and
     * {@code "4dc0"} both yield {@code 19904}.
     *
     * @param str hexadecimal digits without any prefix; must not be empty
     * @return the parsed value (eight significant digits fill all 32 bits, so the
     *         result may be negative)
     * @throws IllegalArgumentException if {@code str} is empty, contains a
     *                                  non-hex character, or does not fit in 32 bits
     */
    private static int hexStrToInt(String str) {
        if (str.isEmpty()) {
            throw new IllegalArgumentException("invalid char: empty hex string");
        }
        int result = 0;
        for (int k = 0; k < str.length(); k++) {
            // The next shift would push the top nibble out of the int.
            if ((result & 0xF0000000) != 0) {
                throw new IllegalArgumentException("hex value does not fit in 32 bits: " + str);
            }
            result = (result << 4) | hexDigitValue(str.charAt(k));
        }
        return result;
    }

    /**
     * Returns the numeric value (0-15) of a single ASCII hexadecimal digit.
     *
     * @param c the digit character
     * @return its value
     * @throws IllegalArgumentException if {@code c} is not one of 0-9, a-f, A-F
     */
    private static int hexDigitValue(char c) {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10;
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10;
        }
        throw new IllegalArgumentException("invalid char: '" + c + "'");
    }

    /**
     * Encodes a single Unicode code point into its UTF-8 byte sequence by hand and
     * decodes those bytes back into a {@link String}.
     *
     * <p>An {@code int} has 32 bits; a code point uses at most the low 21 of them.
     * The number of bytes depends on the range:
     * <ul>
     *   <li>U+0000..U+007F: 1 byte {@code 0xxxxxxx}</li>
     *   <li>U+0080..U+07FF: 2 bytes {@code 110xxxxx 10xxxxxx}</li>
     *   <li>U+0800..U+FFFF: 3 bytes {@code 1110xxxx 10xxxxxx 10xxxxxx}</li>
     *   <li>U+10000..U+10FFFF: 4 bytes {@code 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx}</li>
     * </ul>
     *
     * <p>Surrogate code points (U+D800..U+DFFF) are not rejected here; they go
     * through the three-byte branch, which is not well-formed UTF-8, so the decoded
     * string is expected to hold replacement characters instead of the surrogate.
     *
     * @param unicode a single Unicode code point
     * @return the string obtained by decoding the UTF-8 bytes of that code point
     * @throws IllegalArgumentException if {@code unicode} is negative or greater
     *                                  than U+10FFFF
     */
    private static String unicodeToUtf8Str(int unicode) {
        byte[] bytes;
        if (unicode >= 0 && unicode <= 0x7F) {
            bytes = new byte[] {(byte) unicode};
        } else if (unicode >= 0x80 && unicode <= 0x7FF) {
            bytes = new byte[] {
                (byte) (0b11000000 | (unicode >> 6)),
                (byte) (CONTINUATION | (unicode & SIX_BITS))
            };
        } else if (unicode >= 0x800 && unicode <= 0xFFFF) {
            bytes = new byte[] {
                (byte) (0b11100000 | (unicode >> 12)),
                (byte) (CONTINUATION | ((unicode >> 6) & SIX_BITS)),
                (byte) (CONTINUATION | (unicode & SIX_BITS))
            };
        } else if (unicode >= 0x10000 && unicode <= MAX_CODE_POINT) {
            // The lower bound is inclusive: U+10000 is the first four-byte code point.
            bytes = new byte[] {
                (byte) (0b11110000 | (unicode >> 18)),
                (byte) (CONTINUATION | ((unicode >> 12) & SIX_BITS)),
                (byte) (CONTINUATION | ((unicode >> 6) & SIX_BITS)),
                (byte) (CONTINUATION | (unicode & SIX_BITS))
            };
        } else {
            throw new IllegalArgumentException("error unicode: " + unicode);
        }
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /**
     * Formats the low three bytes of {@code value} as {@code "0x"} followed by six
     * lower-case hex digits, e.g. {@code 0x4DC0 -> "0x004dc0"}.
     *
     * @param value the number to format
     * @return the row label text
     */
    private static String toHexLabel(int value) {
        byte[] bytes = intToByteArrayHighFirst(value);
        StringBuilder label = new StringBuilder("0x");
        // Skip bytes[0]: the label shows only the low 24 bits.
        for (int k = 1; k < bytes.length; k++) {
            label.append(String.format("%02x", bytes[k] & 0xff));
        }
        return label.toString();
    }

    /**
     * Splits an {@code int} into four bytes, most significant byte first.
     * E.g. {@code 19904 -> {0b00000000, 0b00000000, 0b01001101, (byte) 0b11000000}}.
     *
     * @param a the value to split
     * @return a four-element array with the highest byte at index 0
     */
    private static byte[] intToByteArrayHighFirst(int a) {
        byte[] b = new byte[4];
        b[3] = (byte) (a & 0xff);
        b[2] = (byte) (a >> 8 & 0xff);
        b[1] = (byte) (a >> 16 & 0xff);
        b[0] = (byte) (a >> 24 & 0xff);
        return b;
    }
}