I am stuck with an error with the encoding of non-ascii characters from a FlowFile content, in NiFi. I am processing the text with an ExecuteScript
processor using Jython.
The flow is a simple GenerateFlowFile
with the text set to This is a special character ×.
This processor is connected to the ExecuteScript
processor with the following Jython code:
import csv, json
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from java.io import BufferedReader, InputStreamReader
from org.apache.nifi.processor.io import InputStreamCallback
# Define a subclass of InputStreamCallback for use in session.read()
class PyInputStreamCallback(InputStreamCallback):
    """Callback for session.read() that collects the stream's lines as text.

    process() reads the stream through a BufferedReader and concatenates
    every line it gets into ``_text``; getText() hands that text back.
    """

    _text = ''  # Class-level default; process() rebinds it on the instance.

    def __init__(self):
        pass

    def getText(self):
        """Return the text accumulated by process()."""
        return self._text

    def process(self, inputStream):
        """Read inputStream line by line, appending each line to ``_text``.

        Any exception raised while reading is logged as a warning via
        ``log.warn`` and not re-raised.
        """
        # The reader is built without an explicit charset argument.
        stream_reader = InputStreamReader(inputStream)
        #stream_reader = InputStreamReader(inputStream, StandardCharsets.ISO_8859_1)
        buffered = BufferedReader(stream_reader)
        try:
            current = buffered.readLine()
            line_count = 0
            while True:
                # Stops on the first falsy value returned by readLine().
                if not current:
                    break
                self._text += current
                current = buffered.readLine()
                line_count += 1
        except Exception as err:
            log.warn(str(err))
        stream_reader.close()
        buffered.close()
# end class
# Fetch one FlowFile, read its content through the callback and append the
# resulting text (plus a newline) to a local file.
flowfile = session.get()
if flowfile is not None:
    try:
        callback = PyInputStreamCallback()
        session.read(flowfile, callback)
        text = callback.getText()
        with open('/path/to/myFile.txt', 'a') as out:
            out.write(text + '\n')
        #session.remove(flowfile)
        session.transfer(flowfile, REL_SUCCESS)
    except Exception as err:
        # NOTE(review): the value returned by putAttribute is discarded here.
        session.putAttribute(flowfile, 'scriptError', str(err))
        session.transfer(flowfile, REL_FAILURE)
The error from the title is retrieved from the bulletin due to the logging in the line with log.warn(str(e))
.
I need to read the content of the FlowFile and append it to a file in a file system.
I also have an alternative code for the same task, which also yields the same error:
import csv, json
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import InputStreamCallback
# Define a subclass of InputStreamCallback for use in session.read()
class PyInputStreamCallback(InputStreamCallback):
    """Callback for session.read() that appends the stream's text to a file."""

    def __init__(self):
        pass

    def process(self, inputStream):
        """Convert inputStream to text and append it, plus a newline, to the file.

        The stream is converted with IOUtils.toString using
        StandardCharsets.UTF_8, then written with Python's built-in open()
        in append mode.
        """
        decoded = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
        #decoded = IOUtils.toString(inputStream, StandardCharsets.ISO_8859_1)
        out = open('/path/to/myFile.txt', 'a')
        try:
            out.write(decoded + '\n')
        finally:
            out.close()
# end class
# Fetch one FlowFile, let the callback write its content to disk, then drop
# the FlowFile; on any error, record the message and route to failure.
flowfile = session.get()
if flowfile is not None:
    try:
        callback = PyInputStreamCallback()
        session.read(flowfile, callback)
        session.remove(flowfile)
    except Exception as err:
        # NOTE(review): the value returned by putAttribute is discarded here.
        session.putAttribute(flowfile, 'scriptError', str(err))
        session.transfer(flowfile, REL_FAILURE)
With this second code, the error is seen in the attribute named scriptError
.
If I specify StandardCharsets.ISO_8859_1, like in the following line:
text = IOUtils.toString(inputStream, StandardCharsets.ISO_8859_1)
Then the error message changes to:
'ascii' codec can't encode character u'\xd7' in position 28: ordinal not in range(128)
Specifying the Charset seems to detect the correct byte code of the character, but it fails handling it anyway.
Any idea on how to fix that?
I am stuck with an error with the encoding of non-ascii characters from a FlowFile content, in NiFi. I am processing the text with an ExecuteScript
processor using Jython.
The flow is a simple GenerateFlowFile
with the text set to This is a special character ×.
This processor is connected to the ExecuteScript
processor with the following Jython code:
import csv, json
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from java.io import BufferedReader, InputStreamReader
from org.apache.nifi.processor.io import InputStreamCallback
# Define a subclass of InputStreamCallback for use in session.read()
class PyInputStreamCallback(InputStreamCallback):
    """Callback for session.read() that collects the stream's lines as text.

    process() reads the stream through a BufferedReader and concatenates
    every line it gets into ``_text``; getText() hands that text back.
    """

    _text = ''  # Class-level default; process() rebinds it on the instance.

    def __init__(self):
        pass

    def getText(self):
        """Return the text accumulated by process()."""
        return self._text

    def process(self, inputStream):
        """Read inputStream line by line, appending each line to ``_text``.

        Any exception raised while reading is logged as a warning via
        ``log.warn`` and not re-raised.
        """
        # The reader is built without an explicit charset argument.
        stream_reader = InputStreamReader(inputStream)
        #stream_reader = InputStreamReader(inputStream, StandardCharsets.ISO_8859_1)
        buffered = BufferedReader(stream_reader)
        try:
            current = buffered.readLine()
            line_count = 0
            while True:
                # Stops on the first falsy value returned by readLine().
                if not current:
                    break
                self._text += current
                current = buffered.readLine()
                line_count += 1
        except Exception as err:
            log.warn(str(err))
        stream_reader.close()
        buffered.close()
# end class
# Fetch one FlowFile, read its content through the callback and append the
# resulting text (plus a newline) to a local file.
flowfile = session.get()
if flowfile is not None:
    try:
        callback = PyInputStreamCallback()
        session.read(flowfile, callback)
        text = callback.getText()
        with open('/path/to/myFile.txt', 'a') as out:
            out.write(text + '\n')
        #session.remove(flowfile)
        session.transfer(flowfile, REL_SUCCESS)
    except Exception as err:
        # NOTE(review): the value returned by putAttribute is discarded here.
        session.putAttribute(flowfile, 'scriptError', str(err))
        session.transfer(flowfile, REL_FAILURE)
The error from the title is retrieved from the bulletin due to the logging in the line with log.warn(str(e))
.
I need to read the content of the FlowFile and append it to a file in a file system.
I also have an alternative code for the same task, which also yields the same error:
import csv, json
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import InputStreamCallback
# Define a subclass of InputStreamCallback for use in session.read()
class PyInputStreamCallback(InputStreamCallback):
    """Callback for session.read() that appends the stream's text to a file."""

    def __init__(self):
        pass

    def process(self, inputStream):
        """Convert inputStream to text and append it, plus a newline, to the file.

        The stream is converted with IOUtils.toString using
        StandardCharsets.UTF_8, then written with Python's built-in open()
        in append mode.
        """
        decoded = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
        #decoded = IOUtils.toString(inputStream, StandardCharsets.ISO_8859_1)
        out = open('/path/to/myFile.txt', 'a')
        try:
            out.write(decoded + '\n')
        finally:
            out.close()
# end class
# Fetch one FlowFile, let the callback write its content to disk, then drop
# the FlowFile; on any error, record the message and route to failure.
flowfile = session.get()
if flowfile is not None:
    try:
        callback = PyInputStreamCallback()
        session.read(flowfile, callback)
        session.remove(flowfile)
    except Exception as err:
        # NOTE(review): the value returned by putAttribute is discarded here.
        session.putAttribute(flowfile, 'scriptError', str(err))
        session.transfer(flowfile, REL_FAILURE)
With this second code, the error is seen in the attribute named scriptError
.
If I specify StandardCharsets.ISO_8859_1, like in the following line:
text = IOUtils.toString(inputStream, StandardCharsets.ISO_8859_1)
Then the error message changes to:
'ascii' codec can't encode character u'\xd7' in position 28: ordinal not in range(128)
Specifying the Charset seems to detect the correct byte code of the character, but it fails handling it anyway.
Any idea on how to fix that?
Share Improve this question asked 12 hours ago alexalex 12 bronze badges 01 Answer
Reset to default 0'ascii' codec can't encode character ...
There is definitely an encoding error.
My guess is that in your code you are converting the flowfile bytes into text with the "java" function text = reader.getText(),
which is actually reading the InputStream as ASCII, but Java always returns strings as Unicode text,
and then you are writing that Unicode text into a file with a "python/jython" function without specifying an encoding...
I recommend writing whatever you have in the flowfile to the output file without transforming the encoding.
Also, don't use the str() function on unicode strings - the first Python unicode example shows how you can get the 'ascii' codec can't encode character error:
https://docs.python.org/2.7/howto/unicode.html#the-unicode-type
Here is a refactored code. I can't test it on my local but it's pretty simple...
from org.apache.commons.io import IOUtils
from java.nio.file import Paths

# Copy the FlowFile content to disk as raw bytes via ProcessSession.exportTo,
# so the script never decodes or re-encodes the text and no 'ascii' codec
# error can occur while writing.
flowfile = session.get()
if flowfile is not None:
    try:
        path = Paths.get("/path/to/myFile.txt")
        # The third argument is exportTo's `append` flag. Jython booleans are
        # spelled True/False (`false` would raise NameError); True appends to
        # the file, matching the original open(..., 'a') behaviour.
        session.exportTo(flowfile, path, True)
        session.transfer(flowfile, REL_SUCCESS)
    except Exception as e:
        # Prefer Java's toString() over Python's str(), which can fail on
        # non-ASCII messages. Pure Python exceptions have no toString(), so
        # fall back to repr(), which always yields an ASCII-safe string.
        if hasattr(e, 'toString'):
            message = e.toString()
        else:
            message = repr(e)
        # putAttribute returns the updated FlowFile; transfer that reference.
        flowfile = session.putAttribute(flowfile, 'scriptError', message)
        session.transfer(flowfile, REL_FAILURE)
links to docs for used functions:
- https://javadoc.io/static/org.apache.nifi/nifi-api/2.0.0/org.apache.nifi/org/apache/nifi/processor/ProcessSession.html#exportTo(org.apache.nifi.flowfile.FlowFile,java.nio.file.Path,boolean)
- https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/nio/file/Paths.html#get(java.lang.String,java.lang.String...)