I tried following the instructions in the answer given to the following question:
How to index a pdf file in Elasticsearch 5.0.0 with ingest-attachment plugin?
I couldn't find many examples of the JavaScript client for ElasticSearch, so here is what I have:
Create index
// Legacy `elasticsearch` npm client, pointed at a single local node.
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
  hosts: ['http://localhost:9200/'],
});

// Seed `pdfs` by writing one document of type `pdf` with a fixed id.
// NOTE(review): `client.create` looks like the document-create API rather than
// `client.indices.create`, so the index presumably comes into existence as a
// side effect of indexing this document — confirm against the client docs.
client
  .create({
    index: 'pdfs',
    type: 'pdf',
    id: 'my-index-id',
    body: { description: 'Test pdf indexing' },
  })
  .then(() => {
    console.log("Index created");
  })
  .catch((error) => {
    console.log(error);
  });
Define Index Mapping going in Node:
// Mapping for the `pdf` type: `title` and `type` are keyword fields with
// "index" set to the string "false"; `attachment.pdf` is a plain keyword field.
var body = {
  pdf: {
    properties: {
      title: { type: 'keyword', index: 'false' },
      type: { type: 'keyword', index: 'false' },
      'attachment.pdf': { type: 'keyword' },
    },
  },
};

// Apply the mapping to pdfs/pdf; once that resolves, register the ingest
// pipeline. A failure is only logged.
client.indices
  .putMapping({ index: 'pdfs', type: 'pdf', body: body })
  .then(function () {
    addPipeline();
  })
  .catch(function (error) {
    console.log("putMapping error: " + error);
  });
Define Ingest Pipeline in Node cluster with PUT API
/**
 * Registers the ingest pipeline `my-pipeline-id` through
 * `client.ingest.putPipeline` and logs whether the request resolved or failed.
 *
 * The pipeline has two processors, in order:
 *   1. `attachment` reading the `pdf` field with `indexed_chars: -1`
 *      (NOTE(review): -1 is understood to mean "no character limit" —
 *      confirm against the ingest-attachment docs);
 *   2. `remove`, which drops the `pdf` field afterwards.
 *
 * Relies on the module-level `client`. Returns nothing; the promise is
 * handled internally.
 */
function addPipeline() {
  const extractAttachment = { attachment: { field: 'pdf', indexed_chars: -1 } };
  const dropRawField = { remove: { field: 'pdf' } };

  const pipelineRequest = {
    id: 'my-pipeline-id',
    body: {
      description: 'parse pdfs and index into ES',
      processors: [extractAttachment, dropRawField],
    },
  };

  const onRegistered = () => {
    console.log("putPipeline Resolved");
  };
  const onFailed = (error) => {
    console.log("putPipeline error: " + error);
  };

  client.ingest.putPipeline(pipelineRequest).then(onRegistered).catch(onFailed);
}
Before I try to upload a PDF, I checked that the index was created:
curl -XGET 'localhost:9200/_cat/indices?v&pretty'
Result:
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
yellow open .kibana EaUbEQCETVKQbYThrhPGaA 1 1 1 0 3.6kb 3.6kb
yellow open pdfs Z2SR-ApFR9SYsvY08tgSZw 5 1 1 0 4.6kb 4.6kb
When I try to index the PDF with the following command, I get an error.
curl -H 'Content-Type: application/pdf' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d'
{
"pdf": @/Users/user/path/to/pdf/file.pdf
}'
Error:
{"error":"Content-Type header [application/pdf] is not supported","status":406}
Is this because my PDF is not Base64 encoded or am I doing something else wrong? I am trying to create a digital library to search through PDFs.
UPDATE:
I encoded my pdf with:
openssl base64 -in /Users/user/path/to/pdf/file.pdf -out base64_encoded_file
recreated my index and ran the following command on the base64_encoded_file:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d @/base64_encoded_file
And I got the following error:
Warning: Couldn't read data from file "/base64_encoded_file", this makes an empty POST.
{"error":{"root_cause":[{"type":"parse_exception","reason":"request body is required"}],"type":"parse_exception","reason":"request body is required"},"status":400}
I tried adding the file as a body:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : @/base64_encoded_file
}'
Error:
{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character ('@' (code 64)): expected a valid value (number, String, array, object, 'true', 'false' or 'null')\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@6db5a3dc; line: 3, column: 16]"}},"status":400}
Halp
I tried following the instructions in the answer given to the following question:
How to index a pdf file in Elasticsearch 5.0.0 with ingest-attachment plugin?
I couldn't find many examples of the JavaScript client for ElasticSearch, so here is what I have:
Create index
// Legacy `elasticsearch` npm client, pointed at a single local node.
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
  hosts: ['http://localhost:9200/'],
});

// Seed `pdfs` by writing one document of type `pdf` with a fixed id.
// NOTE(review): `client.create` looks like the document-create API rather than
// `client.indices.create`, so the index presumably comes into existence as a
// side effect of indexing this document — confirm against the client docs.
client
  .create({
    index: 'pdfs',
    type: 'pdf',
    id: 'my-index-id',
    body: { description: 'Test pdf indexing' },
  })
  .then(() => {
    console.log("Index created");
  })
  .catch((error) => {
    console.log(error);
  });
Define Index Mapping going in Node:
// Mapping for the `pdf` type: `title` and `type` are keyword fields with
// "index" set to the string "false"; `attachment.pdf` is a plain keyword field.
var body = {
  pdf: {
    properties: {
      title: { type: 'keyword', index: 'false' },
      type: { type: 'keyword', index: 'false' },
      'attachment.pdf': { type: 'keyword' },
    },
  },
};

// Apply the mapping to pdfs/pdf; once that resolves, register the ingest
// pipeline. A failure is only logged.
client.indices
  .putMapping({ index: 'pdfs', type: 'pdf', body: body })
  .then(function () {
    addPipeline();
  })
  .catch(function (error) {
    console.log("putMapping error: " + error);
  });
Define Ingest Pipeline in Node cluster with PUT API
/**
 * Registers the ingest pipeline `my-pipeline-id` through
 * `client.ingest.putPipeline` and logs whether the request resolved or failed.
 *
 * The pipeline has two processors, in order:
 *   1. `attachment` reading the `pdf` field with `indexed_chars: -1`
 *      (NOTE(review): -1 is understood to mean "no character limit" —
 *      confirm against the ingest-attachment docs);
 *   2. `remove`, which drops the `pdf` field afterwards.
 *
 * Relies on the module-level `client`. Returns nothing; the promise is
 * handled internally.
 */
function addPipeline() {
  const extractAttachment = { attachment: { field: 'pdf', indexed_chars: -1 } };
  const dropRawField = { remove: { field: 'pdf' } };

  const pipelineRequest = {
    id: 'my-pipeline-id',
    body: {
      description: 'parse pdfs and index into ES',
      processors: [extractAttachment, dropRawField],
    },
  };

  const onRegistered = () => {
    console.log("putPipeline Resolved");
  };
  const onFailed = (error) => {
    console.log("putPipeline error: " + error);
  };

  client.ingest.putPipeline(pipelineRequest).then(onRegistered).catch(onFailed);
}
Before I try to upload a PDF, I checked that the index was created:
curl -XGET 'localhost:9200/_cat/indices?v&pretty'
Result:
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
yellow open .kibana EaUbEQCETVKQbYThrhPGaA 1 1 1 0 3.6kb 3.6kb
yellow open pdfs Z2SR-ApFR9SYsvY08tgSZw 5 1 1 0 4.6kb 4.6kb
When I try to index the PDF with the following command, I get an error.
curl -H 'Content-Type: application/pdf' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d'
{
"pdf": @/Users/user/path/to/pdf/file.pdf
}'
Error:
{"error":"Content-Type header [application/pdf] is not supported","status":406}
Is this because my PDF is not Base64 encoded or am I doing something else wrong? I am trying to create a digital library to search through PDFs.
UPDATE:
I encoded my pdf with:
openssl base64 -in /Users/user/path/to/pdf/file.pdf -out base64_encoded_file
recreated my index and ran the following command on the base64_encoded_file:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d @/base64_encoded_file
And I got the following error:
Warning: Couldn't read data from file "/base64_encoded_file", this makes an empty POST.
{"error":{"root_cause":[{"type":"parse_exception","reason":"request body is required"}],"type":"parse_exception","reason":"request body is required"},"status":400}
I tried adding the file as a body:
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : @/base64_encoded_file
}'
Error:
{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character ('@' (code 64)): expected a valid value (number, String, array, object, 'true', 'false' or 'null')\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@6db5a3dc; line: 3, column: 16]"}},"status":400}
Halp
Share Improve this question edited Mar 8, 2018 at 6:30 aeronesto asked Mar 7, 2018 at 5:34 aeronestoaeronesto 911 silver badge8 bronze badges1 Answer
I found the answer to my problem:
Elasticsearch does not read the file from a path on disk for you, so
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : @/base64_encoded_file
}'
won't work. The "field" from attachment options (in my example, "pdf") must be data, not a filepath. This thread explains three options for sending [pdf] content to elasticsearch:
- You can extract the content [from the pdf] and just send what you want to index to elasticsearch.
- You can send the binary BASE64 to elasticsearch ingest which will do the extraction
- You can send the binary to FSCrawler which will do the extraction before sending to elasticsearch.
In short, the data passed to elasticsearch must be as defined in the documentation.
curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
{
"pdf" : "base64_encoded_data"
}'