The error output in the log is:
Failed to create table: An error occurred while calling o105.getSink. : java.lang.RuntimeException: Temporary directory for redshift not specified. Please verify --TempDir argument parameters.
I have added --TempDir as s3://prateekproject1/temp_dir/ in job parameter but still getting the error
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
## @params: [JOB_NAME]
job_name = os.environ.get('AWS_GLUE_JOB_NAME', 'testing1')
args = getResolvedOptions(sys.argv, ['JOB_NAME','TempDir'])
temp_dir = args['TempDir']
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
jobmit()
#temp_dir = "s3://prateekproject1/temp_dir/"
redshift_url = "jdbc:redshift://flighttest.ckbw5q2qccz6.eu-north1.redshift.amazonaws:5439/dev"
redshift_user = "awsuser"
redshift_password = "Prateek1997"
redshift_table = "dev.flight.details"
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {redshift_table}(
Carrier varchar(100),
OriginAirportID INT,
DestAirportID INT,
DepDelay INT,
ArrDelay INT
)
DISTSTYLE KEY
DISTKEY(OriginAirportID);
"""
spark._jvm.java.lang.Class.forName("com.amazon.redshift.jdbc42.Driver")
try:
glueContext.write_dynamic_frame.from_options( frame=DynamicFrame.fromDF(spark.sql("SELECT 1 AS dummy"), glueContext, "dummy"),
connection_type="redshift",
connection_options={
"url": redshift_url,
"user": redshift_user,
"password": redshift_password,
"preactions": create_table_sql, # Executes the CREATE TABLE query before loading data
"dbtable": "dev.flight_details", # The table name
"TempDir": temp_dir # Temporary directory for Redshift interactions
}
)
print("Table created succesfully")
except Exception as e:
print("Failed to create table:", str(e))
database_name = "flight" # Replace with the database name
table_name = "prateekproject1"
df = glueContext.create_dynamic_frame.from_catalog(
database = database_name,
table_name = table_name
)
df.printSchema()
dataframe = df.toDF()
print("Sample Data:")
dataframe.show(10)
jobmit()````
The error output in the log is:
Failed to create table: An error occurred while calling o105.getSink. : java.lang.RuntimeException: Temporary directory for redshift not specified. Please verify --TempDir argument parameters.
I have added --TempDir as s3://prateekproject1/temp_dir/ in job parameter but still getting the error
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
## @params: [JOB_NAME]
job_name = os.environ.get('AWS_GLUE_JOB_NAME', 'testing1')
args = getResolvedOptions(sys.argv, ['JOB_NAME','TempDir'])
temp_dir = args['TempDir']
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
jobmit()
#temp_dir = "s3://prateekproject1/temp_dir/"
redshift_url = "jdbc:redshift://flighttest.ckbw5q2qccz6.eu-north1.redshift.amazonaws:5439/dev"
redshift_user = "awsuser"
redshift_password = "Prateek1997"
redshift_table = "dev.flight.details"
create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {redshift_table}(
Carrier varchar(100),
OriginAirportID INT,
DestAirportID INT,
DepDelay INT,
ArrDelay INT
)
DISTSTYLE KEY
DISTKEY(OriginAirportID);
"""
spark._jvm.java.lang.Class.forName("com.amazon.redshift.jdbc42.Driver")
try:
glueContext.write_dynamic_frame.from_options( frame=DynamicFrame.fromDF(spark.sql("SELECT 1 AS dummy"), glueContext, "dummy"),
connection_type="redshift",
connection_options={
"url": redshift_url,
"user": redshift_user,
"password": redshift_password,
"preactions": create_table_sql, # Executes the CREATE TABLE query before loading data
"dbtable": "dev.flight_details", # The table name
"TempDir": temp_dir # Temporary directory for Redshift interactions
}
)
print("Table created succesfully")
except Exception as e:
print("Failed to create table:", str(e))
database_name = "flight" # Replace with the database name
table_name = "prateekproject1"
df = glueContext.create_dynamic_frame.from_catalog(
database = database_name,
table_name = table_name
)
df.printSchema()
dataframe = df.toDF()
print("Sample Data:")
dataframe.show(10)
jobmit()````
Share
Improve this question
asked Nov 21, 2024 at 8:24
Prateek GoelPrateek Goel
191 bronze badge
1 Answer
Reset to default 0Use redshiftTmpDir property instead of TempDir:
glueContext.write_dynamic_frame.from_options( frame=DynamicFrame.fromDF(spark.sql("SELECT 1 AS dummy"), glueContext, "dummy"),
connection_type="redshift",
connection_options={
"url": redshift_url,
"user": redshift_user,
"password": redshift_password,
"preactions": create_table_sql, # Executes the CREATE TABLE query before loading data
"dbtable": "dev.flight_details", # The table name
"redshiftTmpDir": temp_dir # Temporary directory for Redshift interactions
}
)