最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

amazon web services - Getting the error when creating the redshift table from GLUE - Stack Overflow

programmeradmin2浏览0评论

The error output in the log is:

Failed to create table: An error occurred while calling o105.getSink. : java.lang.RuntimeException: Temporary directory for redshift not specified. Please verify --TempDir argument parameters.

I have added --TempDir as s3://prateekproject1/temp_dir/ in job parameter but still getting the error

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

## @params: [JOB_NAME]
job_name = os.environ.get('AWS_GLUE_JOB_NAME', 'testing1')
args = getResolvedOptions(sys.argv, ['JOB_NAME','TempDir'])
temp_dir = args['TempDir']
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

jobmit()
#temp_dir = "s3://prateekproject1/temp_dir/"
redshift_url = "jdbc:redshift://flighttest.ckbw5q2qccz6.eu-north1.redshift.amazonaws:5439/dev"
redshift_user = "awsuser"
redshift_password = "Prateek1997"
redshift_table = "dev.flight.details"
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {redshift_table}(
    Carrier varchar(100),
    OriginAirportID INT,
    DestAirportID   INT,
    DepDelay    INT,
    ArrDelay    INT
    )
    DISTSTYLE KEY
    DISTKEY(OriginAirportID);
 """
 
spark._jvm.java.lang.Class.forName("com.amazon.redshift.jdbc42.Driver")
try:
    glueContext.write_dynamic_frame.from_options(        frame=DynamicFrame.fromDF(spark.sql("SELECT 1 AS dummy"), glueContext, "dummy"),
        connection_type="redshift",
        connection_options={
            "url": redshift_url,
            "user": redshift_user,
            "password": redshift_password,
            "preactions": create_table_sql,  # Executes the CREATE TABLE query before loading data
            "dbtable": "dev.flight_details",  # The table name
            "TempDir": temp_dir  # Temporary directory for Redshift interactions
        }
    )
    print("Table created succesfully")
except Exception as e:
    print("Failed to create table:", str(e))
    
database_name = "flight"  # Replace with the database name
table_name = "prateekproject1"
df = glueContext.create_dynamic_frame.from_catalog(
    database = database_name,
    table_name = table_name
    )
df.printSchema()
dataframe = df.toDF()
print("Sample Data:")
dataframe.show(10)
jobmit()````

The error output in the log is:

Failed to create table: An error occurred while calling o105.getSink. : java.lang.RuntimeException: Temporary directory for redshift not specified. Please verify --TempDir argument parameters.

I have added --TempDir as s3://prateekproject1/temp_dir/ in job parameter but still getting the error

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

## @params: [JOB_NAME]
job_name = os.environ.get('AWS_GLUE_JOB_NAME', 'testing1')
args = getResolvedOptions(sys.argv, ['JOB_NAME','TempDir'])
temp_dir = args['TempDir']
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

jobmit()
#temp_dir = "s3://prateekproject1/temp_dir/"
redshift_url = "jdbc:redshift://flighttest.ckbw5q2qccz6.eu-north1.redshift.amazonaws:5439/dev"
redshift_user = "awsuser"
redshift_password = "Prateek1997"
redshift_table = "dev.flight.details"
create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {redshift_table}(
    Carrier varchar(100),
    OriginAirportID INT,
    DestAirportID   INT,
    DepDelay    INT,
    ArrDelay    INT
    )
    DISTSTYLE KEY
    DISTKEY(OriginAirportID);
 """
 
spark._jvm.java.lang.Class.forName("com.amazon.redshift.jdbc42.Driver")
try:
    glueContext.write_dynamic_frame.from_options(        frame=DynamicFrame.fromDF(spark.sql("SELECT 1 AS dummy"), glueContext, "dummy"),
        connection_type="redshift",
        connection_options={
            "url": redshift_url,
            "user": redshift_user,
            "password": redshift_password,
            "preactions": create_table_sql,  # Executes the CREATE TABLE query before loading data
            "dbtable": "dev.flight_details",  # The table name
            "TempDir": temp_dir  # Temporary directory for Redshift interactions
        }
    )
    print("Table created succesfully")
except Exception as e:
    print("Failed to create table:", str(e))
    
database_name = "flight"  # Replace with the database name
table_name = "prateekproject1"
df = glueContext.create_dynamic_frame.from_catalog(
    database = database_name,
    table_name = table_name
    )
df.printSchema()
dataframe = df.toDF()
print("Sample Data:")
dataframe.show(10)
jobmit()````
Share Improve this question asked Nov 21, 2024 at 8:24 Prateek GoelPrateek Goel 191 bronze badge
Add a comment  | 

1 Answer 1

Reset to default 0

Use redshiftTmpDir property instead of TempDir:

glueContext.write_dynamic_frame.from_options(        frame=DynamicFrame.fromDF(spark.sql("SELECT 1 AS dummy"), glueContext, "dummy"),
    connection_type="redshift",
    connection_options={
        "url": redshift_url,
        "user": redshift_user,
        "password": redshift_password,
        "preactions": create_table_sql,  # Executes the CREATE TABLE query before loading data
        "dbtable": "dev.flight_details",  # The table name
        "redshiftTmpDir": temp_dir  # Temporary directory for Redshift interactions
    }
)
发布评论

评论列表(0)

  1. 暂无评论