# 用R提取PDF文本并创建整洁的数据 (Extracting PDF text with R and creating tidy data)
#https://cloud.tencent.com/developer/article/1475059
# Set up the output directory and point at the PDF to analyse.
# String literals use plain ASCII double quotes: the typographic quotes that
# came with the pasted tutorial code are not valid R string delimiters.
getwd()
path <- "G:/papers/papers/SCRNASEQ AND SPATIAL/codes"
# recursive = TRUE also creates missing parent directories; showWarnings = FALSE
# keeps re-runs quiet when the directory already exists.
dir.create(path, recursive = TRUE, showWarnings = FALSE)
# Kept from the original script: later steps may rely on the working directory.
setwd(path)
library(pdftools)
file <- "G:/papers/papers/SCRNASEQ AND SPATIAL/A spatially restricted fibrotic niche in pulmonary fibrosis is sustained by M-CSF M-CSFR signalling in monocyte-derived alveolar macrophages.pdf"
# Print the document metadata reported by pdftools.
pdf_info(pdf = file)
# pdf_text() extracts the whole document as a character vector, with the
# content of each page placed in its own string.
# NOTE(review): the source tutorial says its example PDF has 5 pages and so
# yields a length-5 vector; the length here depends on `file`.
# The extracted text can be viewed either by printing it to the console
# (print(text) or just `text`) or by selecting a single page with "[ ]".
# Blank areas are rendered as space characters, and "\r\n" marks a line break.
text<- pdf_text(file)
length(text)
class(text)
text[1]
# List the document's attachments.
# NOTE(review): the tutorial's example had none (empty list) - confirm for `file`.
pdf_attachments(file)
# pdf_fonts() reports each font's name, type and whether it is embedded.
# NOTE(review): the tutorial's example PDF had 6 fonts - confirm for `file`.
pdf_fonts(pdf = file)
# Show the page strings at positions 59 and 347 of `text`.
text[59]
text[347]
library(pdftools)
# BiocManager::install("glue")
library(tidyverse)
text[59]
text[347]

# Split pages 59..347 into one string per text line. The quotes shown around
# each line when it is printed are only R's display of a string.
UC_text <- text[59:347] %>%
  readr::read_lines()
head(UC_text, 90)
head(UC_text)

# ---- stringr experiments on single lines ----
str_trim(" String\t") # "String"
UC_text %>% str_trim() %>% strsplit(split = " ")
head(UC_text %>% str_trim())
nchar(UC_text[1])
length(UC_text[1])
paste(UC_text[1], sep = "*? * ", collapse = "_")

# str_split() returns a list with one element per input string, so take [[1]]
# before filtering; otherwise grep()/str_subset() see a single deparsed string.
# An empty pattern matches every token; the later calls use "." to drop blanks.
str_split(UC_text[2], pattern = " ")[[1]] %>% grep(pattern = "", value = TRUE)
UC_text[2]
# as.vector() leaves a list as a list, so `b` is still a length-1 list that
# holds the token vector (str() below shows this).
b <- as.vector(str_split(UC_text[2], pattern = " "))
str(as.vector(str_split(UC_text[2], pattern = " ")))
str(b)
str_subset(str_split(UC_text[2], pattern = " ")[[1]], "1")

# word() picks space-separated words by position; positions 1:100 request up
# to the first 100 words of the line.
word(UC_text[2])
word(UC_text[2], sep = fixed(" "))
word(UC_text[2], 1:100, sep = fixed(" "))

# grep(".") keeps only tokens containing at least one character ("." matches
# any character), which discards the empty and missing entries.
word(UC_text[1], 1:100) %>% grep(pattern = ".", value = TRUE)
word(UC_text[2], 1:100) %>% grep(pattern = ".", value = TRUE)
length(UC_text)

# Everything after the first line. Negative indexing is used instead of
# 2:length(UC_text), which would count backwards (2, 1) on a one-line input.
pure_uctext <- UC_text[-1]
head(pure_uctext)
word(pure_uctext[1], 1:100) %>% grep(pattern = ".", value = TRUE)
# Pre-allocate an all-NA, 8-column character data frame with one row per line
# of `pure_uctext`. Assigning dim() to an empty data.frame is an error, so the
# frame is built from a matrix of the target shape instead.
# NOTE(review): the row count was hard-coded as 17397, presumably the number of
# lines for this particular PDF; it is now derived from the data - confirm.
n_lines <- length(pure_uctext)
mydata <- as.data.frame(matrix(NA_character_, nrow = n_lines, ncol = 8L))
# Still unused here; an earlier, commented-out variant accumulated the union
# of all tokens into it.
myvector <- vector()

# Tokenise every line: split on runs of spaces and keep the non-empty tokens.
# Splitting on " +" (rather than taking word(line, 1:100)) means the blank
# "words" produced by consecutive spaces no longer count toward a 100-word
# cap, so widely spaced lines are not truncated. The vectorised call also
# handles an empty `pure_uctext`, where 1:length(x) would iterate over 1 and 0.
mylist <- lapply(
  str_split(pure_uctext, pattern = " +"),
  function(tokens) tokens[!is.na(tokens) & nzchar(tokens)]
)

# Inspect the tokens of the last line (guarded so empty input does not error).
if (n_lines > 0) {
  mylist[[n_lines]]
}
length(mylist)
mylist
mydata
class(mydata)
str(mydata)
# Small example data frame: name (姓名), employee id (工号), monthly salary (月薪).
# String literals use ASCII double quotes; typographic quotes do not parse in R.
# NOTE: the variable name masks base::table() as a value, but calls such as
# table(x) still resolve to the function.
table <- data.frame(
  姓名 = c("张三", "李四", "王五"),
  工号 = c("001", "002", "003"),
  月薪 = c(1000, 2000, 3000)
)
# 用R提取PDF文本并创建整洁的数据 (Extracting PDF text with R and creating tidy data)