
NLP Basics and Code: Tokenizers

Fundamentals

NLP tokenizers: SentencePiece (with the Chinese-LLaMA-Alpaca code for training a 20K Chinese vocabulary on general Chinese corpora and merging it with the original LLaMA model's 32K vocabulary) - CSDN blog

[OpenLLM 008] Tokenizers as a basic component of large language models: a long-form, comprehensive guide to tokenization algorithms and tokenizers (BPE / WordPiece / ULM & beyond) - Zhihu (zhihu.com)

BPE

Byte Pair Encoding

Steps (a toy walk-through of the merge loop follows the list):

1. Build a corpus.
2. Decide on the vocabulary size.
3. Add a separator marker to each word.
4. Starting from single characters, iteratively merge the most frequent adjacent pair.
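Before the full script below, here is a minimal sketch of the merge loop on the toy vocabulary that also appears as a comment in the script (the classic low/lower/newest/widest word counts); it is an illustration only, not part of the original script:

import re, collections

# Toy vocabulary: each word is pre-split into characters plus the </w> end marker
vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}

def get_stats(vocab):
    # Count adjacent symbol pairs, weighted by word frequency
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for a, b in zip(symbols, symbols[1:]):
            pairs[a, b] += freq
    return pairs

def merge_vocab(pair, v_in):
    # Rewrite every standalone occurrence of "a b" as "ab"
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    return {p.sub(''.join(pair), word): freq for word, freq in v_in.items()}

for i in range(3):
    pairs = get_stats(vocab)
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print(i, best)
# With this toy data the first merges are ('e', 's'), ('es', 't'), ('est', '</w>'):
# frequent word endings get collapsed into a single token first.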

"""
2024/9/9
bpe.py
wang_yi
"""
import re, collections


def get_vocab(filename):
    # For a dict created with defaultdict(int), any missing key is automatically 0 when accessed
    vocab = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            words = line.strip().split()
            for word in words:
                vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab


def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs


def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))  # re.escape makes sure special characters are matched literally in the regex
    # (?<!\S) / (?!\S): the pair must not be preceded or followed by a non-whitespace character
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)  # join the matched pair, keep the rest of the string unchanged
        v_out[w_out] = v_in[word]  # the rewritten word keeps the frequency of the original word
    return v_out


def get_tokens(vocab):
    tokens = collections.defaultdict(int)
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens[token] += freq
    return tokens


# vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}

# Get free book from Gutenberg
# wget http://www.gutenberg.org/cache/epub/16457/pg16457.txt
vocab = get_vocab('pg16457.txt')

print('==========')
print('Tokens Before BPE')
tokens = get_tokens(vocab)
print('Tokens: {}'.format(tokens))
print('Number of tokens: {}'.format(len(tokens)))
print('==========')

num_merges = 1000
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens = get_tokens(vocab)
    print('Tokens: {}'.format(tokens))
    print('Number of tokens: {}'.format(len(tokens)))
    print('==========')

Each word ends with the </w> marker, and every merge adds a new token to the vocabulary.

Printed output:

Tokens Before BPE
Tokens: defaultdict(<class 'int'>, {'T': 1607, 'h': 26103, 'e': 59190, '</w>': 101849, 'P': 780, 'r': 29562, 'o': 35007, 'j': 858, 'c': 13900, 't': 44238, 'G': 300, 'u': 13723, 'n': 32498, 'b': 7426, 'g': 8752, 'B': 1162, 'k': 2732, 'f': 10463, 'A': 1379, 'l': 20619, 'd': 17581, 'M': 1204, 'i': 31414, 's': 28311, 'a': 36695, 'y': 8828, 'w': 8155, 'U': 178, 'S': 865, 'm': 9751, 'p': 8030, 'v': 4878, '.': 4061, 'Y': 250, ',': 8065, '-': 1063, 'L': 426, 'I': 1428, ':': 201, 'J': 78, 'V': 102, 'E': 895, 'R': 369, '6': 73, '2': 160, '0': 402, '5': 124, '[': 32, '#': 1, '1': 291, '4': 99, '7': 60, ']': 32, 'D': 322, 'C': 862, 'K': 41, 'O': 510, '/': 31, '*': 22, 'F': 419, 'H': 688, 'N': 793, '"': 4064, '!': 1214, 'W': 576, '3': 104, "'": 1236, 'Q': 33, 'X': 49, 'Z': 10, '?': 651, '8': 73, '9': 36, '_': 1426, 'à': 3, 'x': 937, 'z': 364, '°': 41, 'q': 575, ';': 561, '(': 53, ')': 53, '{': 23, '}': 16, 'è': 2, 'é': 14, '+': 2, '=': 3, 'ö': 2, 'ê': 5, 'â': 1, 'ô': 1, 'Æ': 3, 'æ': 2, '—': 2, '™': 57, '“': 11, '”': 11, '•': 4, '%': 1, '‘': 1, '’': 6, '$': 2})
Number of tokens: 103
==========
Iter: 0
Best pair: ('e', '</w>')
Tokens: defaultdict(<class 'int'>, {'T': 1607, 'h': 26103, 'e</w>': 17758, 'P': 780, 'r': 29562, 'o': 35007, 'j': 858, 'e': 41432, 'c': 13900, 't': 44238, '</w>': 84091, 'G': 300, 'u': 13723, 'n': 32498, 'b': 7426, 'g': 8752, 'B': 1162, 'k': 2732, 'f': 10463, 'A': 1379, 'l': 20619, 'd': 17581, 'M': 1204, 'i': 31414, 's': 28311, 'a': 36695, 'y': 8828, 'w': 8155, 'U': 178, 'S': 865, 'm': 9751, 'p': 8030, 'v': 4878, '.': 4061, 'Y': 250, ',': 8065, '-': 1063, 'L': 426, 'I': 1428, ':': 201, 'J': 78, 'V': 102, 'E': 895, 'R': 369, '6': 73, '2': 160, '0': 402, '5': 124, '[': 32, '#': 1, '1': 291, '4': 99, '7': 60, ']': 32, 'D': 322, 'C': 862, 'K': 41, 'O': 510, '/': 31, '*': 22, 'F': 419, 'H': 688, 'N': 793, '"': 4064, '!': 1214, 'W': 576, '3': 104, "'": 1236, 'Q': 33, 'X': 49, 'Z': 10, '?': 651, '8': 73, '9': 36, '_': 1426, 'à': 3, 'x': 937, 'z': 364, '°': 41, 'q': 575, ';': 561, '(': 53, ')': 53, '{': 23, '}': 16, 'è': 2, 'é': 14, '+': 2, '=': 3, 'ö': 2, 'ê': 5, 'â': 1, 'ô': 1, 'Æ': 3, 'æ': 2, '—': 2, '™': 57, '“': 11, '”': 11, '•': 4, '%': 1, '‘': 1, '’': 6, '$': 2})
Number of tokens: 104
==========
Iter: 1
Best pair: ('t', 'h')
Tokens: defaultdict(<class 'int'>, {'T': 1607, 'h': 12062, 'e</w>': 17758, 'P': 780, 'r': 29562, 'o': 35007, 'j': 858, 'e': 41432, 'c': 13900, 't': 30197, '</w>': 84091, 'G': 300, 'u': 13723, 'n': 32498, 'b': 7426, 'g': 8752, 'B': 1162, 'k': 2732, 'f': 10463, 'A': 1379, 'l': 20619, 'd': 17581, 'th': 14041, 'M': 1204, 'i': 31414, 's': 28311, 'a': 36695, 'y': 8828, 'w': 8155, 'U': 178, 'S': 865, 'm': 9751, 'p': 8030, 'v': 4878, '.': 4061, 'Y': 250, ',': 8065, '-': 1063, 'L': 426, 'I': 1428, ':': 201, 'J': 78, 'V': 102, 'E': 895, 'R': 369, '6': 73, '2': 160, '0': 402, '5': 124, '[': 32, '#': 1, '1': 291, '4': 99, '7': 60, ']': 32, 'D': 322, 'C': 862, 'K': 41, 'O': 510, '/': 31, '*': 22, 'F': 419, 'H': 688, 'N': 793, '"': 4064, '!': 1214, 'W': 576, '3': 104, "'": 1236, 'Q': 33, 'X': 49, 'Z': 10, '?': 651, '8': 73, '9': 36, '_': 1426, 'à': 3, 'x': 937, 'z': 364, '°': 41, 'q': 575, ';': 561, '(': 53, ')': 53, '{': 23, '}': 16, 'è': 2, 'é': 14, '+': 2, '=': 3, 'ö': 2, 'ê': 5, 'â': 1, 'ô': 1, 'Æ': 3, 'æ': 2, '—': 2, '™': 57, '“': 11, '”': 11, '•': 4, '%': 1, '‘': 1, '’': 6, '$': 2})
Number of tokens: 105
==========
Iter: 2
Best pair: ('t', '</w>')
Tokens: defaultdict(<class 'int'>, {'T': 1607, 'h': 12062, 'e</w>': 17758, 'P': 780, 'r': 29562, 'o': 35007, 'j': 858, 'e': 41432, 'c': 13900, 't</w>': 9280, 'G': 300, 'u': 13723, 't': 20917, 'n': 32498, 'b': 7426, 'g': 8752, '</w>': 74811, 'B': 1162, 'k': 2732, 'f': 10463, 'A': 1379, 'l': 20619, 'd': 17581, 'th': 14041, 'M': 1204, 'i': 31414, 's': 28311, 'a': 36695, 'y': 8828, 'w': 8155, 'U': 178, 'S': 865, 'm': 9751, 'p': 8030, 'v': 4878, '.': 4061, 'Y': 250, ',': 8065, '-': 1063, 'L': 426, 'I': 1428, ':': 201, 'J': 78, 'V': 102, 'E': 895, 'R': 369, '6': 73, '2': 160, '0': 402, '5': 124, '[': 32, '#': 1, '1': 291, '4': 99, '7': 60, ']': 32, 'D': 322, 'C': 862, 'K': 41, 'O': 510, '/': 31, '*': 22, 'F': 419, 'H': 688, 'N': 793, '"': 4064, '!': 1214, 'W': 576, '3': 104, "'": 1236, 'Q': 33, 'X': 49, 'Z': 10, '?': 651, '8': 73, '9': 36, '_': 1426, 'à': 3, 'x': 937, 'z': 364, '°': 41, 'q': 575, ';': 561, '(': 53, ')': 53, '{': 23, '}': 16, 'è': 2, 'é': 14, '+': 2, '=': 3, 'ö': 2, 'ê': 5, 'â': 1, 'ô': 1, 'Æ': 3, 'æ': 2, '—': 2, '™': 57, '“': 11, '”': 11, '•': 4, '%': 1, '‘': 1, '’': 6, '$': 2})
Number of tokens: 106
==========

The generated tokens can be saved and then used to tokenize new input text against the token list.

"""
2024/9/9
bpe_code.py
wang_yi
"""
import re, collections


def get_vocab(filename):
    vocab = collections.defaultdict(int)
    with open(filename, 'r', encoding='utf-8') as fhand:
        for line in fhand:
            words = line.strip().split()
            for word in words:
                vocab[' '.join(list(word)) + ' </w>'] += 1
    return vocab


def get_stats(vocab):
    pairs = collections.defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs


def merge_vocab(pair, v_in):
    v_out = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in v_in:
        w_out = p.sub(''.join(pair), word)
        v_out[w_out] = v_in[word]
    return v_out


def get_tokens_from_vocab(vocab):
    tokens_frequencies = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens_frequencies[token] += freq
        vocab_tokenization[''.join(word_tokens)] = word_tokens
    return tokens_frequencies, vocab_tokenization


def measure_token_length(token):
    if token[-4:] == '</w>':
        return len(token[:-4]) + 1
    else:
        return len(token)


def tokenize_word(string, sorted_tokens, unknown_token='</u>'):
    if string == '':
        return []
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token.replace('.', '[.]'))

        matched_positions = [(m.start(0), m.end(0)) for m in re.finditer(token_reg, string)]
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [matched_position[0] for matched_position in matched_positions]

        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            string_tokens += tokenize_word(string=substring, sorted_tokens=sorted_tokens[i + 1:],
                                           unknown_token=unknown_token)
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += tokenize_word(string=remaining_substring, sorted_tokens=sorted_tokens[i + 1:],
                                       unknown_token=unknown_token)
        break
    return string_tokens


# vocab = {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w e s t </w>': 6, 'w i d e s t </w>': 3}
vocab = get_vocab('pg16457.txt')

print('==========')
print('Tokens Before BPE')
tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
print('All tokens: {}'.format(tokens_frequencies.keys()))
print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
print('==========')

num_merges = 100
for i in range(num_merges):
    pairs = get_stats(vocab)
    if not pairs:
        break
    best = max(pairs, key=pairs.get)
    vocab = merge_vocab(best, vocab)
    print('Iter: {}'.format(i))
    print('Best pair: {}'.format(best))
    tokens_frequencies, vocab_tokenization = get_tokens_from_vocab(vocab)
    print('All tokens: {}'.format(tokens_frequencies.keys()))
    print('Number of tokens: {}'.format(len(tokens_frequencies.keys())))
    print('==========')

# Let's check how tokenization will be for a known word
word_given_known = 'mountains</w>'
word_given_unknown = 'Ilikeeatingapples!</w>'

# Sort by measure_token_length(item[0]); if lengths are equal, sort by item[1] (the frequency).
# reverse=True sorts in descending order, so longer / more frequent tokens are tried first.
sorted_tokens_tuple = sorted(tokens_frequencies.items(),
                             key=lambda item: (measure_token_length(item[0]), item[1]),
                             reverse=True)
sorted_tokens = [token for (token, freq) in sorted_tokens_tuple]

print(sorted_tokens)

word_given = word_given_known
print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenization of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

word_given = word_given_unknown
print('Tokenizing word: {}...'.format(word_given))
if word_given in vocab_tokenization:
    print('Tokenization of the known word:')
    print(vocab_tokenization[word_given])
    print('Tokenization treating the known word as unknown:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))
else:
    print('Tokenization of the unknown word:')
    print(tokenize_word(string=word_given, sorted_tokens=sorted_tokens, unknown_token='</u>'))

Byte Pair Encoding - Lei Mao's Log Book (leimao.github.io)
Thoroughly understanding BPE (Byte Pair Encoding), with a code implementation - CSDN blog

Problem:

A word can often be split in more than one way, and the algorithm has no good way to judge which split is more reasonable: the priority among the possible combinations is undefined, so in practice the first one is simply taken. For example:

When encoding the sentence "linear algebra", the following splits all exist:

linear = li + near, or li + n + ea + r

algebra = al + ge + bra, or al + g + e + bra

In this particular case each word has two possible splits, so the whole sentence has 2 × 2 = 4 possible tokenizations (the small sketch below enumerates the splits of "linear").
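A small sketch (not from the original post) that enumerates the possible segmentations of "linear" under a hypothetical subword vocabulary, just to make the ambiguity concrete:

# Enumerate every way to segment a word with a given subword vocabulary.
# The vocabulary below is hypothetical and only serves this illustration.
def segmentations(word, vocab):
    if word == '':
        return [[]]
    results = []
    for i in range(1, len(word) + 1):
        piece = word[:i]
        if piece in vocab:
            for rest in segmentations(word[i:], vocab):
                results.append([piece] + rest)
    return results

vocab = {'li', 'near', 'n', 'ea', 'r'}
print(segmentations('linear', vocab))
# [['li', 'n', 'ea', 'r'], ['li', 'near']] -> two equally valid splits;
# plain BPE has no principled way to rank them.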

Remedy -> when merging, take into account how large the effect of the merge actually is, comparing the model before and after the merge.

WordPiece

WordPiece generates new subwords based on probability rather than on the most frequent byte pair.

The difference between WordPiece and BPE lies in the merge step: BPE merges the most frequent byte pair, whereas WordPiece adds to the vocabulary the adjacent subword pair whose merge increases the language-model likelihood the most.

Assume a sentence S = (t_1, t_2, ..., t_n) is composed of n subwords t_i, and assume the subwords are independent of each other. The language-model likelihood of S then equals the product of the subword probabilities, or in log form:

\log P(S) = \sum_{i=1}^n \log P(t_i)

If two adjacent subwords x and y are merged into a new subword z, the change in the likelihood of S can be written as

\Delta \log P(S) = \log P(t_z) - (\log P(t_x) + \log P(t_y)) = \log \frac{P(t_z)}{P(t_x)\,P(t_y)}

The pair with the largest likelihood gain is chosen. Concretely, the score divides the probability of the merged token by the probabilities of the tokens before the merge: when considering merging "e" and "s", not only the probability of "es" is taken into account, but also the probabilities of "e" and "s" themselves. In other words, "es" is merged according to the value the merge brings.
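A minimal numeric sketch (not from the original post; the letter and pair counts are made up for illustration) of this score, freq(xy) / (freq(x) * freq(y)), which is the same quantity the compute_pair_scores function in the script below computes:

# Hypothetical corpus statistics: occurrences of single symbols and of adjacent pairs
letter_freqs = {'e': 120, 's': 80, 'x': 5, 'q': 4}
pair_freqs = {('e', 's'): 40, ('x', 'q'): 4}

def score(pair):
    # Frequency of the pair divided by the product of the frequencies of its parts
    a, b = pair
    return pair_freqs[pair] / (letter_freqs[a] * letter_freqs[b])

for pair in pair_freqs:
    print(pair, score(pair))
# ('e', 's') 0.0041...  -> the pair is frequent, but 'e' and 's' are common on their own
# ('x', 'q') 0.2        -> rarer pair, but 'x' and 'q' almost only occur together, so it is merged first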

Steps:

1. Build a corpus.
2. Decide on the vocabulary size.
3. Add separator markers to each word (non-initial characters get the ## prefix).
4. Starting from single characters, iteratively merge the pair with the largest likelihood change before vs. after the merge.

At encoding time, the longest matching subword is looked up greedily, starting from the beginning of the word.

"""
2024/9/10
wordpiece.py
wang_yi
"""
corpus = [
    "This is the Hugging Face course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

from transformers import AutoTokenizer
from collections import defaultdict

# Compute the frequency of each word in the corpus while pre-tokenizing:
tokenizer = AutoTokenizer.from_pretrained(r"G:\llm_model\bert-base-chinese")

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(word_freqs)

# The alphabet is the unique set of all first letters of words, plus the other
# letters that appear in words prefixed by ##:
alphabet = []
for word in word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")
alphabet.sort()

# alphabet
print(alphabet)

# Add the special tokens used by the model at the beginning of the vocabulary.
# For BERT, that is the list ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]:
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()

# Split each word, prefixing every letter that is not the first one with ##:
splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}


# Compute the score of each pair
def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores


pair_scores = compute_pair_scores(splits)
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 5:
        break

best_pair = ""
max_score = None
for pair, score in pair_scores.items():
    if max_score is None or max_score < score:
        best_pair = pair
        max_score = score

print(best_pair, max_score)

# The first merge learned is ('a', '##b') -> 'ab', and 'ab' is added to the vocabulary:
vocab.append("ab")


def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2:]
            else:
                i += 1
        splits[word] = split
    return splits


splits = merge_pair("a", "##b", splits)
print(splits["about"])

vocab_size = 70
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)

print(vocab)


def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens


print(encode_word("Hugging"))
print(encode_word("HOgging"))


def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])


tokenize("This is the Hugging Face course!")

Normalization and pre-tokenization - Hugging Face NLP Course

Output:

defaultdict(<class 'int'>, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'course': 1, '.': 4, 'chapter': 1, 'about': 1, 'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'Hopefully': 1, ',': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'and': 1, 'generate': 1, 'tokens': 1})
['##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']
('T', '##h'): 0.125
('##h', '##i'): 0.03409090909090909
('##i', '##s'): 0.02727272727272727
('i', '##s'): 0.1
('t', '##h'): 0.03571428571428571
('##h', '##e'): 0.011904761904761904
('a', '##b') 0.2
['ab', '##o', '##u', '##t']
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', 'ab', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully', 'Th', '##hm', '##thm', 'Hu', 'Hug', 'Hugg', 'ch', 'cha', 'chap', 'chapt', 'sh', 'th', 'is', '##thms', '##za', '##zat', '##ut', '##ta']
['Hugg', '##i', '##n', '##g']
['[UNK]']
['Th', '##i', '##s', 'is', 'th', '##e', 'Hugg', '##i', '##n', '##g', 'Fac', '##e', 'c', '##o', '##u', '##r', '##s', '##e', '[UNK]']

The first letter of a word stays as-is; the subword pieces inside a word carry the ## prefix.

Unigram Language Model(ULM)

Compared with BPE and WordPiece, Unigram works in the opposite direction: it starts from a large vocabulary and removes tokens from it until the desired vocabulary size is reached. Note that base characters are never removed, so that any word can still be tokenized.

At each training step, the Unigram algorithm computes a loss over the corpus given the current vocabulary. It then goes through every token in the vocabulary and computes how much the overall loss would increase if that token were removed. The tokens whose removal increases the loss the least have the smallest impact on the corpus loss; in that sense they are the "least needed" and are the best candidates for removal.

Steps:

1. Build a sufficiently large initial vocabulary.
2. Estimate the probability of every token in the current vocabulary on the corpus.
3. Given the vocabulary, compute the corpus loss under its best segmentation.
4. For every token, compute the corpus loss under the best segmentation if that token were deleted.
5. Rank tokens by how much their deletion increases the loss, and drop a given proportion of the tokens whose deletion has no or little effect. Single characters are never dropped, to avoid OOV cases.
6. Repeat steps 2-5 until the vocabulary shrinks to the target size.

An example:

Corpus (word, frequency):

("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5)

All tokens in the initial vocabulary (the characters plus the retained substrings):

["h", "u", "g", "hu", "ug", "p", "pu", "n", "un", "b", "bu", "s", "hug", "gs", "ugs"]

Frequency of every token:

("h", 15) ("u", 36) ("g", 20) ("hu", 15) ("ug", 20) ("p", 17) ("pu", 17) ("n", 16) ("un", 16) ("b", 4) ("bu", 4) ("s", 5) ("hug", 15) ("gs", 5) ("ugs", 5)

The probability of a given token is its frequency in the original corpus (the number of times we find it) divided by the sum of the frequencies of all tokens in the vocabulary (so that the probabilities sum to 1). For example, "ug" occurs in "hug", "pug" and "hugs", so its frequency in the corpus is 20. The sum of all frequencies is 210, and the probability of the subword "ug" is therefore 20/210.
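A short sketch (not from the original post) that rebuilds this frequency table and the probability of "ug" from the toy word counts above:

from collections import defaultdict

# Toy corpus: word -> frequency
word_freqs = {"hug": 10, "pug": 5, "pun": 12, "bun": 4, "hugs": 5}

# Tokens kept in the toy vocabulary (characters plus the listed subwords)
toy_vocab = ["h", "u", "g", "hu", "ug", "p", "pu", "n", "un", "b", "bu", "s", "hug", "gs", "ugs"]

# Count how often each vocabulary token occurs as a substring of the corpus words
token_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    for i in range(len(word)):
        for j in range(i + 1, len(word) + 1):
            if word[i:j] in toy_vocab:
                token_freqs[word[i:j]] += freq

total = sum(token_freqs.values())
print(token_freqs["ug"], total)      # 20 210
print(token_freqs["ug"] / total)     # 0.0952... = 20/210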

"pug" 的标记化 ["p", "u", "g"] 的概率为:

标记化 ["pu", "g"] 的概率为:

Splitting a word into as few tokens as possible therefore gives a higher probability.

The overall loss function is -\sum_x \text{num}(x) \log p(x),
where num(x) is the number of times the word x appears in the corpus and p(x) is the probability of x under the chosen tokenization.

For the following tokenization of the corpus:

"hug": ["hug"] (probability 0.071428)
"pug": ["pu", "g"] (probability 0.007710)
"pun": ["pu", "n"] (probability 0.006168)
"bun": ["bu", "n"] (probability 0.001451)
"hugs": ["hug", "s"] (probability 0.001701)

the loss is:
10 * (-log(0.071428)) + 5 * (-log(0.007710)) + 12 * (-log(0.006168)) + 4 * (-log(0.001451)) + 5 * (-log(0.001701)) = 169.8

For every token, compute how the overall loss (under the best segmentation) changes when that token is removed.

For example, remove the token "hug".

The best segmentation of the word "hug" then becomes ["hu", "g"], with probability (15/210) × (20/210) ≈ 0.006802. (The tokenization of "hugs" changes from ["hug", "s"] to ["hu", "gs"], but both have the same probability, so it contributes nothing.)

Change in the overall loss = num("hug") × ((-log(0.006802)) - (-log(0.071428))) = 10 × (log(0.071428) - log(0.006802)) ≈ 23.5
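A short sketch (not from the original post) reproducing these numbers from the toy probabilities:

from math import log

# Token probabilities from the toy frequency table above (frequency / 210)
p = {"hug": 15 / 210, "hu": 15 / 210, "g": 20 / 210, "s": 5 / 210, "gs": 5 / 210}

p_before = p["hug"]            # best split of the word "hug" is ["hug"]
p_after = p["hu"] * p["g"]     # with the token "hug" removed, the best split becomes ["hu", "g"]
print(p_before, p_after)       # 0.0714..., 0.0068...

# The word "hug" occurs 10 times; "hugs" does not contribute because
# ["hug", "s"] and ["hu", "gs"] have the same probability.
delta_loss = 10 * (-log(p_after) - (-log(p_before)))
print(delta_loss)              # ~23.5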

"""
2024/9/10
ulm.py
wang_yi
"""
# import os
# os.environ['ALL_PROXY'] = 'http://127.0.0.1:7890'

corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

from transformers import AutoTokenizer
from collections import defaultdict

tokenizer = AutoTokenizer.from_pretrained(r"G:\llm_model\xlnet-base-cased")

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(word_freqs)

# Single-character tokens
char_freqs = defaultdict(int)
# All other (multi-character) candidate tokens
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    for i in range(len(word)):
        char_freqs[word[i]] += freq
        # Loop through the subwords of length at least 2
        for j in range(i + 2, len(word) + 1):
            subwords_freqs[word[i:j]] += freq

# Sort subwords by frequency
sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)  # items() yields (key, value) pairs
print(sorted_subwords[:10])

token_freqs = list(char_freqs.items()) + sorted_subwords[: 300 - len(char_freqs)]
token_freqs = {token: freq for token, freq in token_freqs}

from math import log

total_sum = sum([freq for token, freq in token_freqs.items()])
# Each token's score: the negative log of its probability
model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}


def encode_word(word, model):
    best_segmentations = [{"start": 0, "score": 1}] + [
        {"start": None, "score": None} for _ in range(len(word))
    ]
    for start_idx in range(len(word)):
        # This should be properly filled by the previous steps of the loop
        best_score_at_start = best_segmentations[start_idx]["score"]
        for end_idx in range(start_idx + 1, len(word) + 1):
            token = word[start_idx:end_idx]
            if token in model and best_score_at_start is not None:
                score = model[token] + best_score_at_start
                # If we have found a better segmentation ending at end_idx, we update
                if (
                    best_segmentations[end_idx]["score"] is None
                    or best_segmentations[end_idx]["score"] > score
                ):
                    best_segmentations[end_idx] = {"start": start_idx, "score": score}

    segmentation = best_segmentations[-1]
    if segmentation["score"] is None:
        # We did not find a tokenization of the word -> unknown
        return ["<unk>"], None

    score = segmentation["score"]
    start = segmentation["start"]
    end = len(word)
    tokens = []
    while start != 0:
        tokens.insert(0, word[start:end])
        next_start = best_segmentations[start]["start"]
        end = start
        start = next_start
    tokens.insert(0, word[start:end])
    return tokens, score


print(encode_word("Hopefully", model))
print(encode_word("This", model))


def compute_loss(model):
    loss = 0
    for word, freq in word_freqs.items():
        _, word_loss = encode_word(word, model)
        loss += freq * word_loss
    return loss


print(compute_loss(model))

import copy


def compute_scores(model):
    scores = {}
    model_loss = compute_loss(model)
    for token, score in model.items():
        # We always keep tokens of length 1
        if len(token) == 1:
            continue
        model_without_token = copy.deepcopy(model)
        _ = model_without_token.pop(token)
        scores[token] = compute_loss(model_without_token) - model_loss
    return scores


scores = compute_scores(model)
print(scores["ll"])
print(scores["his"])

percent_to_remove = 0.1
while len(model) > 100:
    scores = compute_scores(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])
    # Remove percent_to_remove tokens with the lowest scores.
    for i in range(int(len(model) * percent_to_remove)):
        _ = token_freqs.pop(sorted_scores[i][0])

    total_sum = sum([freq for token, freq in token_freqs.items()])
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}


def tokenize(text, model):
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in words_with_offsets]
    encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]
    return sum(encoded_words, [])


print(tokenize("This is the Hugging Face course.", model))

The ▁ character is used to mark the beginning of a word.

defaultdict(<class 'int'>, {'▁This': 3, '▁is': 2, '▁the': 1, '▁Hugging': 1, '▁Face': 1, '▁Course.': 1, '▁chapter': 1, '▁about': 1, '▁tokenization.': 1, '▁section': 1, '▁shows': 1, '▁several': 1, '▁tokenizer': 1, '▁algorithms.': 1, '▁Hopefully,': 1, '▁you': 1, '▁will': 1, '▁be': 1, '▁able': 1, '▁to': 1, '▁understand': 1, '▁how': 1, '▁they': 1, '▁are': 1, '▁trained': 1, '▁and': 1, '▁generate': 1, '▁tokens.': 1})
[('▁t', 7), ('is', 5), ('er', 5), ('▁a', 5), ('▁to', 4), ('to', 4), ('en', 4), ('▁T', 3), ('▁Th', 3), ('▁Thi', 3)]
(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)
(['This'], 6.288267030694535)
413.10377642940875
6.376412403623874
0.0
['▁This', '▁is', '▁the', '▁Hugging', '▁Face', '▁', 'c', 'ou', 'r', 's', 'e', '.']
