最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

How to index a PDF in Elasticsearch 6.1 with ingest-attachment plugin & JavaScript Client? - Stack Overflow

programmeradmin1浏览0评论

I tried following the instructions in the answer given to the following question:

How to index a pdf file in Elasticsearch 5.0.0 with ingest-attachment plugin?

I couldn't find many examples of the JavaScript client for ElasticSearch, so here is what I have:

Create index

// Legacy `elasticsearch` JavaScript client, pointed at the local node.
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
  hosts: ['http://localhost:9200/'],
});

// Send a first document (type "pdf", id "my-index-id") to the "pdfs" index
// and log the outcome of the request.
client
  .create({
    index: 'pdfs',
    type: 'pdf',
    id: 'my-index-id',
    body: { description: 'Test pdf indexing' },
  })
  .then(() => {
    console.log("Index created");
  })
  .catch((error) => {
    console.log(error);
  });

Define Index Mapping going in Node:

// Mapping for documents of type "pdf": `title` and `type` are keyword fields
// declared with index "false"; "attachment.pdf" is a plain keyword field.
var body = {
  pdf: {
    properties: {
      title: { type: "keyword", index: "false" },
      type: { type: "keyword", index: "false" },
      "attachment.pdf": { type: "keyword" },
    },
  },
};

// Apply the mapping to the "pdfs" index; the ingest pipeline is only
// registered once the mapping request has succeeded.
client.indices
  .putMapping({ index: "pdfs", type: "pdf", body: body })
  .then(function () {
    addPipeline();
  })
  .catch(function (error) {
    console.log("putMapping error: " + error);
  });

Define Ingest Pipeline in Node cluster with PUT API

/**
 * Registers the ingest pipeline `my-pipeline-id` via `client.ingest.putPipeline`.
 *
 * The pipeline runs two processors in order: `attachment`, which reads the
 * `pdf` field with `indexed_chars: -1`, and `remove`, which drops the `pdf`
 * field afterwards.
 *
 * @returns {Promise<void>} Promise that settles once the outcome of the
 *   request has been logged. Request failures are logged, not rethrown.
 */
function addPipeline() {
  // Return the chain (the original discarded it) so callers can sequence
  // further work after the pipeline has been registered.
  return client.ingest
    .putPipeline({
      id: 'my-pipeline-id',
      body: {
        description: "parse pdfs and index into ES",
        processors: [
          { attachment: { field: "pdf", indexed_chars: -1 } },
          { remove: { field: "pdf" } },
        ],
      },
    })
    .then(function () {
      console.log("putPipeline Resolved");
    })
    .catch(function (error) {
      console.log("putPipeline error: " + error);
    });
}

Before I try to upload a PDF, I checked that the index was created:

curl -XGET 'localhost:9200/_cat/indices?v&pretty'

Result:

health status index   uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   .kibana EaUbEQCETVKQbYThrhPGaA   1   1          1            0      3.6kb          3.6kb
yellow open   pdfs    Z2SR-ApFR9SYsvY08tgSZw   5   1          1            0      4.6kb          4.6kb

When I try to index the PDF with the following command, I get an error.

curl -H 'Content-Type: application/pdf' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d'
{
    "pdf": @/Users/user/path/to/pdf/file.pdf
}'

Error:

{"error":"Content-Type header [application/pdf] is not supported","status":406}

Is this because my PDF is not Base64 encoded or am I doing something else wrong? I am trying to create a digital library to search through PDFs.

UPDATE:

I encoded my pdf with:

openssl base64 -in /Users/user/path/to/pdf/file.pdf -out base64_encoded_file

recreated my index and ran the following command on the base64_encoded_file:

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d @/base64_encoded_file

And I got the following error:

Warning: Couldn't read data from file "/base64_encoded_file", this makes an empty POST.
{"error":{"root_cause":[{"type":"parse_exception","reason":"request body is required"}],"type":"parse_exception","reason":"request body is required"},"status":400}

I tried adding the file as a body:

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
        {
          "pdf" : @/base64_encoded_file
        }'

Error:

{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character ('@' (code 64)): expected a valid value (number, String, array, object, 'true', 'false' or 'null')\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@6db5a3dc; line: 3, column: 16]"}},"status":400}

Halp

I tried following the instructions in the answer given to the following question:

How to index a pdf file in Elasticsearch 5.0.0 with ingest-attachment plugin?

I couldn't find many examples of the JavaScript client for ElasticSearch, so here is what I have:

Create index

// Legacy `elasticsearch` JavaScript client, pointed at the local node.
var elasticsearch = require('elasticsearch');
var client = new elasticsearch.Client({
  hosts: ['http://localhost:9200/'],
});

// Send a first document (type "pdf", id "my-index-id") to the "pdfs" index
// and log the outcome of the request.
client
  .create({
    index: 'pdfs',
    type: 'pdf',
    id: 'my-index-id',
    body: { description: 'Test pdf indexing' },
  })
  .then(() => {
    console.log("Index created");
  })
  .catch((error) => {
    console.log(error);
  });

Define Index Mapping going in Node:

// Mapping for documents of type "pdf": `title` and `type` are keyword fields
// declared with index "false"; "attachment.pdf" is a plain keyword field.
var body = {
  pdf: {
    properties: {
      title: { type: "keyword", index: "false" },
      type: { type: "keyword", index: "false" },
      "attachment.pdf": { type: "keyword" },
    },
  },
};

// Apply the mapping to the "pdfs" index; the ingest pipeline is only
// registered once the mapping request has succeeded.
client.indices
  .putMapping({ index: "pdfs", type: "pdf", body: body })
  .then(function () {
    addPipeline();
  })
  .catch(function (error) {
    console.log("putMapping error: " + error);
  });

Define Ingest Pipeline in Node cluster with PUT API

/**
 * Registers the ingest pipeline `my-pipeline-id` via `client.ingest.putPipeline`.
 *
 * The pipeline runs two processors in order: `attachment`, which reads the
 * `pdf` field with `indexed_chars: -1`, and `remove`, which drops the `pdf`
 * field afterwards.
 *
 * @returns {Promise<void>} Promise that settles once the outcome of the
 *   request has been logged. Request failures are logged, not rethrown.
 */
function addPipeline() {
  // Return the chain (the original discarded it) so callers can sequence
  // further work after the pipeline has been registered.
  return client.ingest
    .putPipeline({
      id: 'my-pipeline-id',
      body: {
        description: "parse pdfs and index into ES",
        processors: [
          { attachment: { field: "pdf", indexed_chars: -1 } },
          { remove: { field: "pdf" } },
        ],
      },
    })
    .then(function () {
      console.log("putPipeline Resolved");
    })
    .catch(function (error) {
      console.log("putPipeline error: " + error);
    });
}

Before I try to upload a PDF, I checked that the index was created:

curl -XGET 'localhost:9200/_cat/indices?v&pretty'

Result:

health status index   uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   .kibana EaUbEQCETVKQbYThrhPGaA   1   1          1            0      3.6kb          3.6kb
yellow open   pdfs    Z2SR-ApFR9SYsvY08tgSZw   5   1          1            0      4.6kb          4.6kb

When I try to index the PDF with the following command, I get an error.

curl -H 'Content-Type: application/pdf' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d'
{
    "pdf": @/Users/user/path/to/pdf/file.pdf
}'

Error:

{"error":"Content-Type header [application/pdf] is not supported","status":406}

Is this because my PDF is not Base64 encoded or am I doing something else wrong? I am trying to create a digital library to search through PDFs.

UPDATE:

I encoded my pdf with:

openssl base64 -in /Users/user/path/to/pdf/file.pdf -out base64_encoded_file

recreated my index and ran the following command on the base64_encoded_file:

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d @/base64_encoded_file

And I got the following error:

Warning: Couldn't read data from file "/base64_encoded_file", this makes an empty POST.
{"error":{"root_cause":[{"type":"parse_exception","reason":"request body is required"}],"type":"parse_exception","reason":"request body is required"},"status":400}

I tried adding the file as a body:

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
        {
          "pdf" : @/base64_encoded_file
        }'

Error:

{"error":{"root_cause":[{"type":"parse_exception","reason":"Failed to parse content to map"}],"type":"parse_exception","reason":"Failed to parse content to map","caused_by":{"type":"json_parse_exception","reason":"Unexpected character ('@' (code 64)): expected a valid value (number, String, array, object, 'true', 'false' or 'null')\n at [Source: org.elasticsearch.transport.netty4.ByteBufStreamInput@6db5a3dc; line: 3, column: 16]"}},"status":400}

Halp

Share Improve this question edited Mar 8, 2018 at 6:30 aeronesto asked Mar 7, 2018 at 5:34 aeronestoaeronesto 911 silver badge8 bronze badges
Add a comment  | 

1 Answer 1

Reset to default 6

I found the answer to my problem:

Elasticsearch does not fetch data from source so,

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
        {
          "pdf" : @/base64_encoded_file
        }'

won't work. The "field" from attachment options (in my example, "pdf") must be data, not a filepath. This thread explains three options for sending [pdf] content to elasticsearch:

  1. You can extract the content [from the pdf] and just send what you want to index to elasticsearch.
  2. You can send the binary BASE64 to elasticsearch ingest which will do the extraction
  3. You can send the binary to FSCrawler which will do the extraction before sending to elasticsearch.

In short, the data passed to elasticsearch must be as defined in the documentation.

curl -H 'Content-Type: application/json' -XPUT 'localhost:9200/my_index/my_type/id?pipeline=my-pipeline-id' -d '
    {
        "pdf" : "base64_encoded_data"
    }'
发布评论

评论列表(0)

  1. 暂无评论