# Using R with Amazon Web Services for document analysis
# https://aws.amazon.com/blogs/opensource/using-r-with-amazon-web-services-for-document-analysis/
# This R program shows how you can use AWS with R to build a data pipeline for
# extracting data from PDFs for later processing, using Amazon Textract, S3,
# and the Relational Database Service (RDS).
# The PDF document is from the Greenbook projections, a set of economic
# projections made by the Federal Reserve from 1966 to the present. They are
# available as a collection of PDFs from the Philadelphia Federal Reserve at:
# https://www.philadelphiafed.org/research-and-data/real-time-center/greenbook-data/pdf-data-set
# This R program expects that you have:
# - An S3 bucket with PDFs
# - An RDS PostgreSQL database with IAM authentication
# - A user, instance, or container with IAM permissions to access Textract
# and your RDS database
# - R, and the packages paws, DBI, RPostgres, and jsonlite
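# A minimal setup sketch, assuming CRAN access (jsonlite is used below to
# serialize the extracted tables):
# install.packages(c("paws", "DBI", "RPostgres", "jsonlite"))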
#-------------------------------------------------------------------------------
# The location of the first Greenbook Projections PDF document in S3.
bucket <- "my-bucket"
file <- "GS-1966-01-11.pdf"
textract <- paws::textract()
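# paws reads credentials and region from the standard AWS sources (environment
# variables, ~/.aws/config, or an instance/container role). To pin a region
# explicitly, a sketch (the region shown is an assumption):
# textract <- paws::textract(config = list(region = "us-east-1"))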
#-------------------------------------------------------------------------------
# Get a PDF document's tables using Amazon Textract.
# --------------------------------------------------
analyze_document <- function(bucket, file) {
  # Start an asynchronous analysis of the PDF.
  resp <- textract$start_document_analysis(
    DocumentLocation = list(
      S3Object = list(Bucket = bucket, Name = file)
    ),
    FeatureTypes = "TABLES"
  )
  # Poll (up to 30 times) until the analysis is done, then get the result.
  result <- NULL
  count <- 0
  while (count < 30 && (is.null(result) || result$JobStatus == "IN_PROGRESS")) {
    Sys.sleep(1)
    result <- textract$get_document_analysis(
      JobId = resp$JobId
    )
    # If the result has multiple parts, get the remaining parts.
    next_token <- result$NextToken
    while (length(next_token) > 0) {
      next_result <- textract$get_document_analysis(
        JobId = resp$JobId,
        NextToken = next_token
      )
      result$Blocks <- c(result$Blocks, next_result$Blocks)
      next_token <- next_result$NextToken
    }
    count <- count + 1
  }
  return(result)
}
analysis <- analyze_document(bucket, file)
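# A quick sanity check on the result, using the JobStatus and Blocks fields
# handled above; this stops the script if the polling loop timed out or the
# analysis failed.
stopifnot(analysis$JobStatus == "SUCCEEDED")
cat("Blocks returned:", length(analysis$Blocks), "\n")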
#-------------------------------------------------------------------------------
# Turn the Textract analysis result into matrices.
# ------------------------------------------------
# Get all children for a given block.
get_children <- function(block, data) {
  if (length(block$Relationships) == 0) {
    return(list())
  }
  idx <- which(sapply(block$Relationships, function(x) x$Type) == "CHILD")
  if (length(idx) == 0) {
    return(list())
  }
  child_ids <- block$Relationships[[idx[1]]]$Ids
  result <- data[child_ids]
  return(result)
}
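# Illustration with hypothetical blocks: a parent's CHILD relationship lists
# the IDs of its children, which get_children() looks up in the ID-named list.
# blocks <- list(
#   "p1" = list(Id = "p1", Relationships = list(list(Type = "CHILD", Ids = "w1"))),
#   "w1" = list(Id = "w1", BlockType = "WORD", Text = "1966")
# )
# get_children(blocks[["p1"]], blocks)  # a list containing the "w1" block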
# Get all tables for a given document analysis returned by `analyze_document`.
get_tables <- function(analysis) {
  blocks <- analysis$Blocks
  names(blocks) <- sapply(blocks, function(x) x$Id)
  tables <- list()
  for (block in blocks) {
    if (block$BlockType == "TABLE") {
      cells <- get_children(block, blocks)
      rows <- max(sapply(cells, function(x) x$RowIndex))
      cols <- max(sapply(cells, function(x) x$ColumnIndex))
      table <- matrix(nrow = rows, ncol = cols)
      # 1. Go through the table's cells one by one
      for (cell in cells) {
        # 2. Get the cell's contents
        words <- get_children(cell, blocks)
        text <- paste(sapply(words, function(x) x$Text), collapse = " ")
        # 3. Insert the cell contents into the matrix
        row <- cell$RowIndex
        col <- cell$ColumnIndex
        table[row, col] <- text
      }
      tables <- c(tables, list(table))
    }
  }
  return(tables)
}
tables <- get_tables(analysis)
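# Each element of `tables` is a character matrix of cell text; peek at the
# first one to confirm the extraction looks sensible.
cat("Tables found:", length(tables), "\n")
if (length(tables) > 0) print(tables[[1]])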
#-------------------------------------------------------------------------------
# Upload our results to our database.
# -----------------------------------
# Connect to the database using an IAM authentication token.
rds <- paws::rds()
token <- rds$build_auth_token("myhost:5432", "us-east-1", "david")
con <- DBI::dbConnect(
  RPostgres::Postgres(),
  host = "myhost", port = 5432, dbname = "mydb",
  user = "david", password = token
)
# Create rows for each table to insert into the database.
database_rows <- data.frame(
  document = file,
  table_num = seq_along(tables),
  data = sapply(tables, jsonlite::toJSON)
)
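# Note: dbAppendTable() appends to an existing table; it does not create one.
# A minimal schema sketch for a first run (the column types are an assumption):
# DBI::dbExecute(con, "
#   CREATE TABLE IF NOT EXISTS tables (
#     document  text,
#     table_num integer,
#     data      jsonb
#   )
# ")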
# Store the tables in the database.
DBI::dbAppendTable(con, name = "tables", value = database_rows)
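# Close the database connection when done.
DBI::dbDisconnect(con)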