当前位置: 首页 > news >正文

python ubuntu dlib 10- 生成自然语言向量

import sys
import dlib

计算特征向量:

def sentence_to_vectors(sentence):
    """Convert a sentence into a dense per-word feature sequence.

    The sentence is split on whitespace and each word becomes one
    1-dimensional ``dlib.vector``: ``[1]`` when the word's first character
    is uppercase, ``[0]`` otherwise.

    Args:
        sentence: The text to featurize.

    Returns:
        A ``dlib.vectors`` array holding one vector per word, in word order.
    """
    features = dlib.vectors()
    for token in sentence.split():
        # The single feature is the capitalization flag of the first letter.
        flag = 1 if token[0].isupper() else 0
        features.append(dlib.vector([flag]))
    return features

计算稀疏向量表示(当特征向量中包含大量的 0 时,这个版本更为高效):

def sentence_to_sparse_vectors(sentence):
    """Convert a sentence into a sparse per-word feature sequence.

    Encodes the same capitalization feature as the dense version, but each
    word is represented by a ``dlib.sparse_vector``.

    Args:
        sentence: The text to featurize; it is split on whitespace.

    Returns:
        A ``dlib.sparse_vectors`` array holding one sparse vector per word,
        in word order.
    """
    # Sparse counterpart of dlib.vector([1]): one (index 0, value 1) entry.
    capitalized = dlib.sparse_vector()
    capitalized.append(dlib.pair(0, 1))

    # Nothing is added to this one, so it plays the role of dlib.vector([0]).
    not_capitalized = dlib.sparse_vector()

    result = dlib.sparse_vectors()
    for token in sentence.split():
        result.append(capitalized if token[0].isupper() else not_capitalized)
    return result

定义打印函数并准备训练数据:

def print_segment(sentence, names):
    """Write the words covered by each segment to standard output.

    Args:
        sentence: The text the segments refer to; it is split on whitespace.
        names: An iterable of segments, where each segment is itself an
            iterable of word indices into the split sentence.

    Every segment is written on its own line, with a single space after
    each word (including the last one).
    """
    tokens = sentence.split()
    emit = sys.stdout.write
    for segment in names:
        for idx in segment:
            emit(tokens[idx] + " ")
        emit("\n")



# Training data.  Every example pairs a sentence with the half-open word
# ranges [begin, end) that mark where person names are located.  For instance
# (8, 10) in the first sentence identifies the string "Jim Smith".
names = dlib.ranges()      # scratch array of dlib.range objects, reused below
segments = dlib.rangess()  # array of arrays of dlib.range objects
sentences = []

_training_examples = [
    ("The other day I saw a man named Jim Smith", [(8, 10)]),
    ("Davis King is the main author of the dlib Library", [(0, 2)]),
    ("Bob Jones is a name and so is George Clinton", [(0, 2), (8, 10)]),
    ("My dog is named Bob Barker", [(4, 6)]),
    ("ABC is an acronym but John James Smith is a name", [(5, 8)]),
    ("No names in this sentence at all", []),
]

for _text, _spans in _training_examples:
    sentences.append(_text)
    for _begin, _end in _spans:
        names.append(dlib.range(_begin, _end))
    segments.append(names)
    names.clear()  # empty the scratch array so the next example can reuse it


# The training sentences must be turned into arrays of vectors before they
# are handed to the dlib tools.  Either a sparse or a dense representation
# can be used; flip the flag below to switch between the two.
use_sparse_vects = False
if use_sparse_vects:
    # Array of arrays of dlib.sparse_vector objects.
    training_sequences = dlib.sparse_vectorss()
    _to_sequence = sentence_to_sparse_vectors
else:
    # Array of arrays of dlib.vector objects.
    training_sequences = dlib.vectorss()
    _to_sequence = sentence_to_vectors

for s in sentences:
    training_sequences.append(_to_sequence(s))

 

训练并调用模型:

# Trainer configuration.
params = dlib.segmenter_params()
params.use_BIO_model = True
params.use_high_order_features = True
params.window_size = 3
# The usual SVM C parameter: larger values push the trainer to fit the data
# exactly, at the risk of overfitting.  It is normally chosen by
# cross-validation.
params.C = 10

# Train the model; the resulting object predicts where names are located in
# new sentences.
model = dlib.train_sequence_segmenter(training_sequences, segments, params)

# Print what the model considers to be names in each training sentence.  The
# model returns a set of ranges predicted to contain names.
for i, s in enumerate(sentences):
    predicted = model(training_sequences[i])
    print_segment(s, predicted)

# Segment a sentence the model has not seen.  The vector representation must
# be the same one that was used during training.
test_sentence = ("There once was a man from Nantucket "
                 "whose name rhymed with Bob Bucket")
if use_sparse_vects:
    _featurize = sentence_to_sparse_vectors
else:
    _featurize = sentence_to_vectors
print_segment(test_sentence, model(_featurize(test_sentence)))

# Score the model against the labeled data in training_sequences/segments
# (precision, recall and F1-score, per the original example).
training_scores = dlib.test_sequence_segmenter(model, training_sequences,
                                               segments)
print("Test on training data: {}".format(training_scores))

# Same metrics, obtained through 5-fold cross-validation.
cv_scores = dlib.cross_validate_sequence_segmenter(training_sequences,
                                                   segments, 5, params)
print("Cross validation: {}".format(cv_scores))

 

相关文章:

  • VS2010无法运行与调试Silverlight 3应用程序
  • python ubuntu dlib 人脸识别11-物体追踪
  • 何必言精通——十年杂感 兼谈其它
  • Azure部署nodejs webapp服务无法启动常见原因
  • 解读《第二十四次互联网报告》
  • 使用.NET SFTP 登陆linux上传下载文件
  • c# 使用7zip
  • Ubuntu source list
  • C# EF动态获取连接字符串的MSDTC配置
  • ubuntu 安装python3和open cv
  • Linux下类FreeBSD uprintf实现
  • ubuntu配置ssh服务
  • 软件项目一直赶工期,越改越忙,怎么破?
  • 迁移Blog平台
  • 使用Python face_recognition 人脸识别 - 1
  • 实现windows 窗体的自己画,网上摘抄的,学习了
  • 【comparator, comparable】小总结
  • axios请求、和返回数据拦截,统一请求报错提示_012
  • C# 免费离线人脸识别 2.0 Demo
  • ECMAScript 6 学习之路 ( 四 ) String 字符串扩展
  • Java编程基础24——递归练习
  • Java多线程(4):使用线程池执行定时任务
  • JS实现简单的MVC模式开发小游戏
  • magento2项目上线注意事项
  • PV统计优化设计
  • Redash本地开发环境搭建
  • Spring Security中异常上抛机制及对于转型处理的一些感悟
  • 机器学习中为什么要做归一化normalization
  • 解决jsp引用其他项目时出现的 cannot be resolved to a type错误
  • 开源地图数据可视化库——mapnik
  • 聊聊hikari连接池的leakDetectionThreshold
  • 全栈开发——Linux
  • 三栏布局总结
  • 深度学习在携程攻略社区的应用
  • mysql面试题分组并合并列
  • 如何用纯 CSS 创作一个菱形 loader 动画
  • # 数论-逆元
  • ###C语言程序设计-----C语言学习(3)#
  • $ git push -u origin master 推送到远程库出错
  • ${factoryList }后面有空格不影响
  • (html转换)StringEscapeUtils类的转义与反转义方法
  • (八)c52学习之旅-中断实验
  • (独孤九剑)--文件系统
  • (十三)Maven插件解析运行机制
  • (转)Linq学习笔记
  • (转)MVC3 类型“System.Web.Mvc.ModelClientValidationRule”同时存在
  • **CI中自动类加载的用法总结
  • .net core webapi 部署iis_一键部署VS插件:让.NET开发者更幸福
  • .NET Core/Framework 创建委托以大幅度提高反射调用的性能
  • .net 验证控件和javaScript的冲突问题
  • .net 逐行读取大文本文件_如何使用 Java 灵活读取 Excel 内容 ?
  • .NET/ASP.NETMVC 深入剖析 Model元数据、HtmlHelper、自定义模板、模板的装饰者模式(二)...
  • .NET6 命令行启动及发布单个Exe文件
  • .net专家(高海东的专栏)
  • [ vulhub漏洞复现篇 ] Apache Flink目录遍历(CVE-2020-17519)