I'm following delta lake liquid clustering document, and got an error [PARSE_SYNTAX_ERROR] Syntax error at or near 'CLUSTER'.
. As I understood, the following code should work since it is supported by delta lake 3.3.
Versions
This is the python library version I'm using.
- pyspark == 3.5.4
- delta-lake == 3.3.0
How to reproduce
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, TimestampType, IntegerType, ShortType
REQUIRED_JARS = [
"io.delta:delta-spark_2.12:3.3.0",
"io.delta:delta-storage:3.3.0",
]
spark = SparkSession.builder \
.config("spark.driver.memory", "16g") \
.config("spark.executor.memory", "64g") \
.config("spark.log.level", "ERROR") \
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
.config("spark.jars.packages", ",".join(REQUIRED_JARS)) \
.getOrCreate()
DeltaTable.createOrReplace() \
.tableName("SAMPLE") \
.addColumn("CLUSTER_TYPE", dataType = StringType()) \
.addColumn("TIMESTAMP", dataType = TimestampType()) \
.addColumn("CLUSTER_NAME", dataType = StringType()) \
.clusterBy("DATA_TIMESTAMP", "CLUSTER_TYPE") \
.execute()
spark.sql("ALTER TABLE SAMPLE CLUSTER BY ('CLUSTER_TYPE')");
Error
---------------------------------------------------------------------------
ParseException Traceback (most recent call last)
Cell In[11], line 2
1 spark.sql(f"ALTER TABLE SAMPLE SET TBLPROPERTIES ('delta.enableDeletionVectors' = false);")
----> 2 spark.sql(f"""ALTER TABLE SAMPLE
3 CLUSTER BY ('CLUSTER_TYPE');""")
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/pyspark/sql/session.py:1631, in SparkSession.sql(self, sqlQuery, args, **kwargs)
1627 assert self._jvm is not None
1628 litArgs = self._jvm.PythonUtils.toArray(
1629 [_to_java_column(lit(v)) for v in (args or [])]
1630 )
-> 1631 return DataFrame(self._jsparkSession.sql(sqlQuery, litArgs), self)
1632 finally:
1633 if len(kwargs) > 0:
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317 self.command_header +\
1318 args_command +\
1319 proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/pyspark/errors/exceptions/captured.py:185, in capture_sql_exception.<locals>.deco(*a, **kw)
181 converted = convert_exception(e.java_exception)
182 if not isinstance(converted, UnknownException):
183 # Hide where the exception came from that shows a non-Pythonic
184 # JVM exception message.
--> 185 raise converted from None
186 else:
187 raise
ParseException:
[PARSE_SYNTAX_ERROR] Syntax error at or near 'CLUSTER'.(line 2, pos 0)
== SQL ==
ALTER TABLE SAMPLE
CLUSTER BY ('CLUSTER_TYPE');
^^^
I'm following delta lake liquid clustering document, and got an error [PARSE_SYNTAX_ERROR] Syntax error at or near 'CLUSTER'.
. As I understood, the following code should work since it is supported by delta lake 3.3.
Versions
This is the python library version I'm using.
- pyspark == 3.5.4
- delta-lake == 3.3.0
How to reproduce
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, TimestampType, IntegerType, ShortType
REQUIRED_JARS = [
"io.delta:delta-spark_2.12:3.3.0",
"io.delta:delta-storage:3.3.0",
]
spark = SparkSession.builder \
.config("spark.driver.memory", "16g") \
.config("spark.executor.memory", "64g") \
.config("spark.log.level", "ERROR") \
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
.config("spark.jars.packages", ",".join(REQUIRED_JARS)) \
.getOrCreate()
DeltaTable.createOrReplace() \
.tableName("SAMPLE") \
.addColumn("CLUSTER_TYPE", dataType = StringType()) \
.addColumn("TIMESTAMP", dataType = TimestampType()) \
.addColumn("CLUSTER_NAME", dataType = StringType()) \
.clusterBy("DATA_TIMESTAMP", "CLUSTER_TYPE") \
.execute()
spark.sql("ALTER TABLE SAMPLE CLUSTER BY ('CLUSTER_TYPE')");
Error
---------------------------------------------------------------------------
ParseException Traceback (most recent call last)
Cell In[11], line 2
1 spark.sql(f"ALTER TABLE SAMPLE SET TBLPROPERTIES ('delta.enableDeletionVectors' = false);")
----> 2 spark.sql(f"""ALTER TABLE SAMPLE
3 CLUSTER BY ('CLUSTER_TYPE');""")
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/pyspark/sql/session.py:1631, in SparkSession.sql(self, sqlQuery, args, **kwargs)
1627 assert self._jvm is not None
1628 litArgs = self._jvm.PythonUtils.toArray(
1629 [_to_java_column(lit(v)) for v in (args or [])]
1630 )
-> 1631 return DataFrame(self._jsparkSession.sql(sqlQuery, litArgs), self)
1632 finally:
1633 if len(kwargs) > 0:
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/py4j/java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317 self.command_header +\
1318 args_command +\
1319 proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File ~/.local/share/hatch/env/virtual/delta-lake-playground/xMOlQu7O/notebook/lib/python3.12/site-packages/pyspark/errors/exceptions/captured.py:185, in capture_sql_exception.<locals>.deco(*a, **kw)
181 converted = convert_exception(e.java_exception)
182 if not isinstance(converted, UnknownException):
183 # Hide where the exception came from that shows a non-Pythonic
184 # JVM exception message.
--> 185 raise converted from None
186 else:
187 raise
ParseException:
[PARSE_SYNTAX_ERROR] Syntax error at or near 'CLUSTER'.(line 2, pos 0)
== SQL ==
ALTER TABLE SAMPLE
CLUSTER BY ('CLUSTER_TYPE');
^^^
Share
Improve this question
edited Feb 7 at 4:32
Ryan Byoun
asked Feb 6 at 6:16
Ryan Byoun
75 reputation · 1 silver badge · 7 bronze badges
3
- post full error message you're getting. – Kashyap Commented Feb 7 at 0:38
- @Kashyap I added the full error message – Ryan Byoun Commented Feb 7 at 4:32
- @RyanByoun have you tried ALTER TABLE SAMPLE CLUSTER BY (CLUSTER_TYPE) – mjeday Commented Feb 7 at 9:39
1 Answer
Try removing the single quotes around the column name and, if possible, use the fully qualified table name:
spark.sql("ALTER TABLE catalog.db.SAMPLE CLUSTER BY (CLUSTER_TYPE) ");