I am running Apache Druid through Docker.
I can successfully load JSON data, data over HTTP(S), etc., but I am having trouble with Parquet files. The "Load data" flow completes and the tasks show as successful under "Tasks", with no errors in the logs; however, I cannot see the data under 'Datasources'.
Below are the files for:
- Docker-compose
- Environment
- Specification
- Parquet file: Flights, from https://www.tablab.app/parquet/sample
I have already tried docker-compose down and docker-compose up.
Please let me know where I might be making a mistake.
--Environment--
# Java tuning
#DRUID_XMX=1g
#DRUID_XMS=1g
#DRUID_MAXNEWSIZE=250m
#DRUID_NEWSIZE=250m
#DRUID_MAXDIRECTMEMORYSIZE=6172m
DRUID_SINGLE_NODE_CONF=micro-quickstart
druid_emitter_logging_logLevel=debug
druid_extensions_loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-multi-stage-query", "druid-parquet-extensions"]
druid_zk_service_host=zookeeper
druid_metadata_storage_host=
druid_metadata_storage_type=postgresql
druid_metadata_storage_connector_connectURI=jdbc:postgresql://postgres:5432/druid
druid_metadata_storage_connector_user=druid
druid_metadata_storage_connector_password=FoolishPassword
druid_indexer_runner_javaOptsArray=["-server", "-Xmx1g", "-Xms1g", "-XX:MaxDirectMemorySize=3g", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
druid_indexer_fork_property_druid_processing_buffer_sizeBytes=256MiB
druid_storage_type=local
druid_storage_storageDirectory=/opt/shared/segments
druid_indexer_logs_type=file
druid_indexer_logs_directory=/opt/shared/indexing-logs
druid_processing_numThreads=2
druid_processing_numMergeBuffers=2
DRUID_LOG4J=<?xml version="1.0" encoding="UTF-8" ?><Configuration status="WARN"><Appenders><Console name="Console" target="SYSTEM_OUT"><PatternLayout pattern="%d{ISO8601} %p [%t] %c - %m%n"/></Console></Appenders><Loggers><Root level="info"><AppenderRef ref="Console"/></Root><Logger name="org.apache.druid.jetty.RequestLog" additivity="false" level="DEBUG"><AppenderRef ref="Console"/></Logger></Loggers></Configuration>
--Specification--
{
  "type": "index_parallel",
  "spec": {
    "dataSchema": {
      "dataSource": "flights",
      "timestampSpec": {
        "column": "FL_DATE",
        "format": "millis",
        "missingValue": null
      },
      "dimensionsSpec": {
        "dimensions": [
          {
            "type": "long",
            "name": "AIR_TIME",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": false
          },
          {
            "type": "double",
            "name": "ARR_TIME",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": false
          },
          {
            "type": "long",
            "name": "DISTANCE",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": false
          },
          {
            "type": "double",
            "name": "DEP_TIME",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": false
          },
          {
            "type": "long",
            "name": "DEP_DELAY",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": false
          },
          {
            "type": "long",
            "name": "ARR_DELAY",
            "multiValueHandling": "SORTED_ARRAY",
            "createBitmapIndex": false
          }
        ],
        "dimensionExclusions": [
          "__time",
          "FL_DATE"
        ],
        "includeAllDimensions": false,
        "useSchemaDiscovery": false
      },
      "metricsSpec": [],
      "granularitySpec": {
        "type": "uniform",
        "segmentGranularity": "HOUR",
        "queryGranularity": {
          "type": "none"
        },
        "rollup": false,
        "intervals": []
      },
      "transformSpec": {
        "filter": null,
        "transforms": []
      }
    },
    "ioConfig": {
      "type": "index_parallel",
      "inputSource": {
        "type": "local",
        "baseDir": "/opt/druid/quickstart/tutorial",
        "filter": "flights.parquet"
      },
      "inputFormat": {
        "type": "parquet"
      },
      "appendToExisting": false,
      "dropExisting": false
    },
    "tuningConfig": {
      "type": "index_parallel",
      "maxRowsPerSegment": 5000000,
      "appendableIndexSpec": {
        "type": "onheap",
        "preserveExistingMetrics": false
      },
      "maxRowsInMemory": 1000000,
      "maxBytesInMemory": 0,
      "skipBytesInMemoryOverheadCheck": false,
      "maxTotalRows": null,
      "numShards": null,
      "splitHintSpec": null,
      "partitionsSpec": {
        "type": "dynamic",
        "maxRowsPerSegment": 5000000,
        "maxTotalRows": null
      },
      "indexSpec": {
        "bitmap": {
          "type": "roaring"
        },
        "dimensionCompression": "lz4",
        "stringDictionaryEncoding": {
          "type": "utf8"
        },
        "metricCompression": "lz4",
        "longEncoding": "longs"
      },
      "indexSpecForIntermediatePersists": {
        "bitmap": {
          "type": "roaring"
        },
        "dimensionCompression": "lz4",
        "stringDictionaryEncoding": {
          "type": "utf8"
        },
        "metricCompression": "lz4",
        "longEncoding": "longs"
      },
      "maxPendingPersists": 0,
      "forceGuaranteedRollup": false,
      "reportParseExceptions": false,
      "pushTimeout": 0,
      "segmentWriteOutMediumFactory": null,
      "maxNumConcurrentSubTasks": 1,
      "maxRetry": 3,
      "taskStatusCheckPeriodMs": 1000,
      "chatHandlerTimeout": "PT10S",
      "chatHandlerNumRetries": 5,
      "maxNumSegmentsToMerge": 100,
      "totalNumMergeTasks": 10,
      "logParseExceptions": false,
      "maxParseExceptions": 2147483647,
      "maxSavedParseExceptions": 0,
      "maxColumnsToMerge": -1,
      "awaitSegmentAvailabilityTimeoutMillis": 0,
      "maxAllowedLockCount": -1,
      "numPersistThreads": 1,
      "partitionDimensions": []
    }
  },
  "context": {
    "forceTimeChunkLock": true,
    "useLineageBasedSegmentAllocation": true
  }
}
--docker-compose--
version: "2.2"
volumes:
  metadata_data: {}
  middle_var: {}
  historical_var: {}
  broker_var: {}
  coordinator_var: {}
  router_var: {}
  druid_shared: {}
services:
  postgres:
    container_name: postgres
    image: postgres:latest
    ports:
      - "5432:5432"
    volumes:
      - metadata_data:/var/lib/postgresql/data
    environment:
      - POSTGRES_PASSWORD=pass*****
      - POSTGRES_USER=druid
      - POSTGRES_DB=druid
  zookeeper:
    container_name: zookeeper
    image: zookeeper:3.5.10
    ports:
      - "2181:2181"
    environment:
      - ZOO_MY_ID=1
  coordinator:
    image: apache/druid:32.0.0
    container_name: coordinator
    volumes:
      - druid_shared:/opt/shared
      - coordinator_var:/opt/druid/var
    depends_on:
      - zookeeper
      - postgres
    ports:
      - "8081:8081"
    command:
      - coordinator
    env_file:
      - environment
    environment:
      - druid.extensions.loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-multi-stage-query", "druid-parquet-extensions", "druid-avro-extensions"]
  broker:
    image: apache/druid:32.0.0
    container_name: broker
    volumes:
      - broker_var:/opt/druid/var
    depends_on:
      - zookeeper
      - postgres
      - coordinator
    ports:
      - "8082:8082"
    command:
      - broker
    env_file:
      - environment
    environment:
      - druid.extensions.loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-multi-stage-query", "druid-parquet-extensions", "druid-avro-extensions"]
  historical:
    image: apache/druid:32.0.0
    container_name: historical
    volumes:
      - druid_shared:/opt/shared
      - historical_var:/opt/druid/var
    depends_on:
      - zookeeper
      - postgres
      - coordinator
    ports:
      - "8083:8083"
    command:
      - historical
    env_file:
      - environment
    environment:
      - druid.extensions.loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-multi-stage-query", "druid-parquet-extensions", "druid-avro-extensions"]
  middlemanager:
    image: apache/druid:32.0.0
    container_name: middlemanager
    volumes:
      - druid_shared:/opt/shared
      - middle_var:/opt/druid/var
    depends_on:
      - zookeeper
      - postgres
      - coordinator
    ports:
      - "8091:8091"
      - "8100-8105:8100-8105"
    command:
      - middleManager
    env_file:
      - environment
    environment:
      - druid.extensions.loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-multi-stage-query", "druid-parquet-extensions", "druid-avro-extensions"]
  router:
    image: apache/druid:32.0.0
    container_name: router
    volumes:
      - router_var:/opt/druid/var
    depends_on:
      - zookeeper
      - postgres
      - coordinator
    ports:
      - "8888:8888"
    command:
      - router
    env_file:
      - environment
    environment:
      - druid.extensions.loadList=["druid-histogram", "druid-datasketches", "druid-lookups-cached-global", "postgresql-metadata-storage", "druid-multi-stage-query", "druid-parquet-extensions", "druid-avro-extensions"]
Thanks in advance. Regards, Chaitanya
--Answer--
Some deeper checks for you.
When ingestion tasks complete, they write segments to deep storage - if I read your Docker + config correctly, that is /opt/shared/segments. My first check would be to look in that mount and confirm that segments have been created. If you do not see them, check the logs for the tasks themselves, which I believe you have configured to go to /opt/shared/indexing-logs.
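For example, something like the following (a sketch only; the container names and paths are taken from your compose and environment files) would show whether anything reached deep storage and let you read the task logs:
# Look for pushed segments in the shared deep-storage mount
docker exec coordinator ls -R /opt/shared/segments
# List and tail the task logs written by the ingestion tasks
docker exec middlemanager ls /opt/shared/indexing-logs
docker exec middlemanager sh -c 'tail -n 100 /opt/shared/indexing-logs/*.log'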
Secondly, the Overlord writes to the metadata database (Postgres), cataloguing what has been ingested. Does your POSTGRES_PASSWORD in the compose file match druid_metadata_storage_connector_password in the environment file? To dig into this more deeply, check the logs for the Overlord and MiddleManagers.
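For instance, assuming Druid's default metadata table names, a quick query against Postgres would show whether any segments were actually published:
# druid_segments is the default segments table; adjust if you changed the metadata table base name
docker exec postgres psql -U druid -d druid -c 'SELECT datasource, created_date, used FROM druid_segments ORDER BY created_date DESC LIMIT 10;'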
If you would like to see a working example (which you may wish to adapt), may I recommend looking at https://github.com/implydata/learn-druid, where you will also find a series of Python notebooks with examples.
Incidentally, should you only be running this locally for learning and demonstration, you may prefer to just use bin/start-druid.
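For example (a sketch, assuming the Druid 32.0.0 binary tarball; adjust the download URL to a current mirror if needed):
# Download, unpack, and start a single-machine Druid with auto-sized defaults
curl -O https://dlcdn.apache.org/druid/32.0.0/apache-druid-32.0.0-bin.tar.gz
tar -xzf apache-druid-32.0.0-bin.tar.gz
cd apache-druid-32.0.0
./bin/start-druid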
I also see you're using index_parallel - though I haven't tried this file myself directly, might I suggest you try SQL-based ingestion if you haven't already, since it uses the well-established MSQ engine that improves data layout.
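As a sketch only (the column names and types are copied from your native spec, FL_DATE is assumed to hold millisecond timestamps as your timestampSpec implies, and the local input source must be readable by the tasks), an equivalent SQL-based ingestion could be submitted to the Router's SQL task API:
# Submit an MSQ (SQL-based) ingestion for the same Parquet file; verify the schema against the file first
cat > /tmp/flights_msq.json <<'EOF'
{
  "query": "REPLACE INTO \"flights\" OVERWRITE ALL SELECT MILLIS_TO_TIMESTAMP(\"FL_DATE\") AS \"__time\", \"AIR_TIME\", \"ARR_TIME\", \"DISTANCE\", \"DEP_TIME\", \"DEP_DELAY\", \"ARR_DELAY\" FROM TABLE(EXTERN('{\"type\":\"local\",\"baseDir\":\"/opt/druid/quickstart/tutorial\",\"filter\":\"flights.parquet\"}','{\"type\":\"parquet\"}')) EXTEND (\"FL_DATE\" BIGINT, \"AIR_TIME\" BIGINT, \"ARR_TIME\" DOUBLE, \"DISTANCE\" BIGINT, \"DEP_TIME\" DOUBLE, \"DEP_DELAY\" BIGINT, \"ARR_DELAY\" BIGINT) PARTITIONED BY HOUR"
}
EOF
curl -X POST -H 'Content-Type: application/json' -d @/tmp/flights_msq.json http://localhost:8888/druid/v2/sql/task
The same query can also be pasted into the Query view of the web console, which runs it on the MSQ engine.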