当前位置：首页 > news >正文

python ubuntu dlib 10- 生成自然语言向量

news 来源：原创 2024/4/29 9:22:55

import sys
import dlib

计算特征向量：

def sentence_to_vectors(sentence):
    # Create an empty array of vectors
    vects = dlib.vectors()
    for word in sentence.split():
        # Our vectors are very simple 1-dimensional vectors.  The value of the
        # single feature is 1 if the first letter of the word is capitalized and
        # 0 otherwise.
        if word[0].isupper():
            vects.append(dlib.vector([1]))
        else:
            vects.append(dlib.vector([0]))
    return vects

计算稀疏向量表达式（当特征向量包含太多的0时，这个版本更为高效)：

def sentence_to_sparse_vectors(sentence):
    vects = dlib.sparse_vectors()
    has_cap = dlib.sparse_vector()
    no_cap = dlib.sparse_vector()
    # make has_cap equivalent to dlib.vector([1])
    has_cap.append(dlib.pair(0, 1))

    # Since we didn't add anything to no_cap it is equivalent to
    # dlib.vector([0])
    for word in sentence.split():
        if word[0].isupper():
            vects.append(has_cap)
        else:
            vects.append(no_cap)
    return vects

训练：

def print_segment(sentence, names):
    words = sentence.split()
    for name in names:
        for i in name:
            sys.stdout.write(words[i] + " ")
        sys.stdout.write("\n")



# Now let's make some training data.  Each example is a sentence as well as a
# set of ranges which indicate the locations of any names.   
names = dlib.ranges()     # make an array of dlib.range objects.
segments = dlib.rangess() # make an array of arrays of dlib.range objects.
sentences = []

sentences.append("The other day I saw a man named Jim Smith")
# We want to detect person names.  So we note that the name is located within
# the range [8, 10).  Note that we use half open ranges to identify segments.
# So in this case, the segment identifies the string "Jim Smith".
names.append(dlib.range(8, 10))
segments.append(names)
names.clear() # make names empty for use again below

sentences.append("Davis King is the main author of the dlib Library")
names.append(dlib.range(0, 2))
segments.append(names)
names.clear()

sentences.append("Bob Jones is a name and so is George Clinton")
names.append(dlib.range(0, 2))
names.append(dlib.range(8, 10))
segments.append(names)
names.clear()

sentences.append("My dog is named Bob Barker")
names.append(dlib.range(4, 6))
segments.append(names)
names.clear()

sentences.append("ABC is an acronym but John James Smith is a name")
names.append(dlib.range(5, 8))
segments.append(names)
names.clear()

sentences.append("No names in this sentence at all")
segments.append(names)
names.clear()


# Now before we can pass these training sentences to the dlib tools we need to
# convert them into arrays of vectors as discussed above.  We can use either a
# sparse or dense representation depending on our needs.  In this example, we
# show how to do it both ways.
use_sparse_vects = False
if use_sparse_vects:
    # Make an array of arrays of dlib.sparse_vector objects.
    training_sequences = dlib.sparse_vectorss()
    for s in sentences:
        training_sequences.append(sentence_to_sparse_vectors(s))
else:
    # Make an array of arrays of dlib.vector objects.
    training_sequences = dlib.vectorss()
    for s in sentences:
        training_sequences.append(sentence_to_vectors(s))

调用模型：

params = dlib.segmenter_params()
params.window_size = 3
params.use_high_order_features = True
params.use_BIO_model = True
# This is the common SVM C parameter.  Larger values encourage the trainer to
# attempt to fit the data exactly but might overfit.  In general, you determine
# this parameter by cross-validation.
params.C = 10

# Train a model.  The model object is responsible for predicting the locations
# of names in new sentences.
model = dlib.train_sequence_segmenter(training_sequences, segments, params)

# Let's print out the things the model thinks are names.  The output is a set
# of ranges which are predicted to contain names.  If you run this example
# program you will see that it gets them all correct.
for i, s in enumerate(sentences):
    print_segment(s, model(training_sequences[i]))

# Let's also try segmenting a new sentence.  This will print out "Bob Bucket".
# Note that we need to remember to use the same vector representation as we used
# during training.
test_sentence = "There once was a man from Nantucket " \
                "whose name rhymed with Bob Bucket"
if use_sparse_vects:
    print_segment(test_sentence,
                  model(sentence_to_sparse_vectors(test_sentence)))
else:
    print_segment(test_sentence, model(sentence_to_vectors(test_sentence)))

# We can also measure the accuracy of a model relative to some labeled data.
# This statement prints the precision, recall, and F1-score of the model
# relative to the data in training_sequences/segments.
print("Test on training data: {}".format(
      dlib.test_sequence_segmenter(model, training_sequences, segments)))

# We can also do 5-fold cross-validation and print the resulting precision,
# recall, and F1-score.
print("Cross validation: {}".format(
      dlib.cross_validate_sequence_segmenter(training_sequences, segments, 5,
                                             params)))