How to index a PDF in Elasticsearch 6.1 with ingest-attachment plugin & JavaScript Client?

I tried following the instructions in the answer given to the following question:

How to index a pdf file in Elasticsearch 5.0.0 with ingest-attachment plugin?

I couldn't find many examples of the JavaScript client for ElasticSearch, so here is what I have:

Create index

// Set up the Elasticsearch client against the local node.
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
    hosts: ['http://localhost:9200/']
});

// Issue a create call for document 'my-index-id' (type 'pdf') against 'pdfs'.
// NOTE(review): client.create targets a document; presumably the 'pdfs' index
// is auto-created as a side effect of this call — confirm.
client.create({
    index: 'pdfs',
    type: 'pdf',
    id: 'my-index-id',
    body: {description: 'Test pdf indexing'}
})
    .then(() => {
        console.log('Index created');
    })
    .catch((error) => {
        console.log(error);
    });

Define Index Mapping going in Node:

// Mapping for the 'pdf' type: `title` and `type` are keywords declared with
// "index": "false"; "attachment.pdf" is declared as a plain keyword.
var body = {
    pdf: {
        properties: {
            title: {type: 'keyword', index: 'false'},
            type: {type: 'keyword', index: 'false'},
            'attachment.pdf': {type: 'keyword'}
        }
    }
};

// Apply the mapping to the 'pdfs' index; on success, go on to register the
// ingest pipeline. Failures are logged with a "putMapping error: " prefix.
client.indices.putMapping({index: 'pdfs', type: 'pdf', body: body})
    .then(function () {
        addPipeline();
    })
    .catch(function (error) {
        console.log("putMapping error: " + error);
    });

Define Ingest Pipeline in Node cluster with PUT API

/**
 * Registers the ingest pipeline 'my-pipeline-id' through the shared `client`.
 *
 * The pipeline runs two processors in order:
 *   1. `attachment` on the "pdf" field, with `indexed_chars: -1`
 *      (no extraction limit, per the ingest-attachment plugin docs);
 *   2. `remove`, which drops the "pdf" field afterwards.
 *
 * The promise chain is returned so callers can wait for the pipeline to be
 * registered before indexing documents through it. Request failures are
 * logged here rather than propagated, matching the previous behaviour.
 *
 * @returns {Promise<void>} Settles after the success or the failure of the
 *   putPipeline request has been logged.
 */
function addPipeline() {
  return client.ingest.putPipeline({
    id: 'my-pipeline-id',
    body: {
      "description" : "parse pdfs and index into ES",
      "processors" : [
        { "attachment" : { "field" : "pdf", "indexed_chars" : -1 } },
        { "remove" : { "field" : "pdf" } }
      ]
    }
  })
    .then(function () {
      console.log("putPipeline Resolved");
    })
    .catch(function (error) {
      console.log("putPipeline error: " + error);
    });
}

Before I try to upload a PDF, I checked that the index was created:

curl -XGET 'localhost:9200/_cat/indices?v&pretty'

Result:

health status index   uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   .kibana EaUbEQCETVKQbYThrhPGaA   1   1          1            0      3.6kb          3.6kb
yellow open   pdfs    Z2SR-ApFR9SYsvY08tgSZw   5   1          1            0      4.6kb          4.6kb

When I try to index the PDF with the following command, I get an error.

curl -H 'Content-Type: application/pdf' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d'
{
    "pdf": @/Users/user/path/to/pdf/file.pdf
}'

Error:

{"error":"Content-Type header [application/pdf] is not supported","status":406}

Is this because my PDF is not Base64 encoded or am I doing something else wrong? I am trying to create a digital library to search through PDFs.

UPDATE:

I encoded my pdf with:

openssl base64 -in /Users/user/path/to/pdf/file.pdf -out base64_encoded_file

recreated my index and ran the following command on the base64_encoded_file:

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d @/base64_encoded_file

And I got the following error:

Warning: Couldn't read data from file "/base64_encoded_file", this makes an empty POST.
{"error":{"root_cause":[{"type":"parse_exception","reason":"request body is required"}],"type":"parse_exception","reason":"request body is required"},"status":400}

I tried adding the file as a body:

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
        {
          "pdf" : @/base64_encoded_file
        }'

Error:

{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character ('@' (code 64)): expected a valid value (number, String, array, object, 'true', 'false' or 'null')\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@6db5a3dc; line: 3, column: 16]"}},"status":400}

Halp

I tried following the instructions in the answer given to the following question:

How to index a pdf file in Elasticsearch 5.0.0 with ingest-attachment plugin?

I couldn't find many examples of the JavaScript client for ElasticSearch, so here is what I have:

Create index

// Set up the Elasticsearch client against the local node.
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
    hosts: ['http://localhost:9200/']
});

// Issue a create call for document 'my-index-id' (type 'pdf') against 'pdfs'.
// NOTE(review): client.create targets a document; presumably the 'pdfs' index
// is auto-created as a side effect of this call — confirm.
client.create({
    index: 'pdfs',
    type: 'pdf',
    id: 'my-index-id',
    body: {description: 'Test pdf indexing'}
})
    .then(() => {
        console.log('Index created');
    })
    .catch((error) => {
        console.log(error);
    });

Define Index Mapping going in Node:

// Mapping for the 'pdf' type: `title` and `type` are keywords declared with
// "index": "false"; "attachment.pdf" is declared as a plain keyword.
var body = {
    pdf: {
        properties: {
            title: {type: 'keyword', index: 'false'},
            type: {type: 'keyword', index: 'false'},
            'attachment.pdf': {type: 'keyword'}
        }
    }
};

// Apply the mapping to the 'pdfs' index; on success, go on to register the
// ingest pipeline. Failures are logged with a "putMapping error: " prefix.
client.indices.putMapping({index: 'pdfs', type: 'pdf', body: body})
    .then(function () {
        addPipeline();
    })
    .catch(function (error) {
        console.log("putMapping error: " + error);
    });

Define Ingest Pipeline in Node cluster with PUT API

/**
 * Registers the ingest pipeline 'my-pipeline-id' through the shared `client`.
 *
 * The pipeline runs two processors in order:
 *   1. `attachment` on the "pdf" field, with `indexed_chars: -1`
 *      (no extraction limit, per the ingest-attachment plugin docs);
 *   2. `remove`, which drops the "pdf" field afterwards.
 *
 * The promise chain is returned so callers can wait for the pipeline to be
 * registered before indexing documents through it. Request failures are
 * logged here rather than propagated, matching the previous behaviour.
 *
 * @returns {Promise<void>} Settles after the success or the failure of the
 *   putPipeline request has been logged.
 */
function addPipeline() {
  return client.ingest.putPipeline({
    id: 'my-pipeline-id',
    body: {
      "description" : "parse pdfs and index into ES",
      "processors" : [
        { "attachment" : { "field" : "pdf", "indexed_chars" : -1 } },
        { "remove" : { "field" : "pdf" } }
      ]
    }
  })
    .then(function () {
      console.log("putPipeline Resolved");
    })
    .catch(function (error) {
      console.log("putPipeline error: " + error);
    });
}

Before I try to upload a PDF, I checked that the index was created:

curl -XGET 'localhost:9200/_cat/indices?v&pretty'

Result:

health status index   uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   .kibana EaUbEQCETVKQbYThrhPGaA   1   1          1            0      3.6kb          3.6kb
yellow open   pdfs    Z2SR-ApFR9SYsvY08tgSZw   5   1          1            0      4.6kb          4.6kb

When I try to index the PDF with the following command, I get an error.

curl -H 'Content-Type: application/pdf' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d'
{
    "pdf": @/Users/user/path/to/pdf/file.pdf
}'

Error:

{"error":"Content-Type header [application/pdf] is not supported","status":406}

Is this because my PDF is not Base64 encoded or am I doing something else wrong? I am trying to create a digital library to search through PDFs.

UPDATE:

I encoded my pdf with:

openssl base64 -in /Users/user/path/to/pdf/file.pdf -out base64_encoded_file

recreated my index and ran the following command on the base64_encoded_file:

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d @/base64_encoded_file

And I got the following error:

Warning: Couldn't read data from file "/base64_encoded_file", this makes an empty POST.
{"error":{"root_cause":[{"type":"parse_exception","reason":"request body is required"}],"type":"parse_exception","reason":"request body is required"},"status":400}

I tried adding the file as a body:

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
        {
          "pdf" : @/base64_encoded_file
        }'

Error:

{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character ('@' (code 64)): expected a valid value (number, String, array, object, 'true', 'false' or 'null')\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@6db5a3dc; line: 3, column: 16]"}},"status":400}

Halp

Share Improve this question edited Mar 8, 2018 at 6:30 asked Mar 7, 2018 at 5:34 aeronesto 911 silver badge8 bronze badges

Add a comment |

1 Answer 1

Sorted by: Reset to default 6

I found the answer to my problem:

Elasticsearch does not fetch data from source so,

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
        {
          "pdf" : @/base64_encoded_file
        }'

won't work. The "field" from attachment options (in my example, "pdf") must be data, not a filepath. This thread explains three options for sending [pdf] content to elasticsearch:

You can extract the content [from the pdf] and just send what you want to index to elasticsearch.
You can send the binary BASE64 to elasticsearch ingest which will do the extraction
You can send the binary to FSCrawler which will do the extraction before sending to elasticsearch.

In short, the data passed to elasticsearch must be as defined in the documentation.

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
    {
        "pdf" : "base64_encoded_data"
    }'

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

How to index a PDF in Elasticsearch 6.1 with ingest-attachment plugin & JavaScript Client? - Stack Overflow

UPDATE:

UPDATE:

1 Answer 1

与本文相关的文章

评论列表(0)