javascript - Split a CSV string by line skipping newlines contained between quotes

If the following regex can split a csv string by line.

var lines = csv.split(/\r|\r?\n/g);

How could this be adapted to skip newline chars that are contained within a CSV value (Ie between quotes/double-quotes)?

Example:

2,"Evans & Sutherland","230-132-111AA",,"Visual","P
CB",,1,"Offsite",

If you don't see it, here's a version with the newlines visible:

2,"Evans & Sutherland","230-132-111AA",,"Visual","P\r\nCB",,1,"Offsite",\r\n

The part I'm trying to skip over is the newline contained in the middle of the "PCB" entry.

Update:

I probably should've mentioned this before but this is a part of a dedicated CSV parsing library called jquery-csv. To provide a better context I have added the current parser implementation below.

Here's the code for validating and parsing an entry (ie one line):

$.csvEntry2Array = function(csv, meta) {
  var meta = (meta !== undefined ? meta : {});
  var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;

  // build the CSV validator regex
  var reValid = /^\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*(?:S\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*)*$/;
  reValid = RegExp(reValid.source.replace(/S/g, separator));
  reValid = RegExp(reValid.source.replace(/D/g, delimiter));

  // build the CSV line parser regex
  var reValue = /(?!\s*$)\s*(?:D([^D\\]*(?:\\[\S\s][^D\\]*)*)D|([^SD\s\\]*(?:\s+[^SD\s\\]+)*))\s*(?:S|$)/g;
  reValue = RegExp(reValue.source.replace(/S/g, separator), 'g');
  reValue = RegExp(reValue.source.replace(/D/g, delimiter), 'g');

  // Return NULL if input string is not well formed CSV string.
  if (!reValid.test(csv)) {
    return null;
  }

  // "Walk" the string using replace with callback.
  var output = [];
  csv.replace(reValue, function(m0, m1, m2) {
    // Remove backslash from any delimiters in the value
    if (m1 !== undefined) {
      var reDelimiterUnescape = /\\D/g;              
      reDelimiterUnescape = RegExp(reDelimiterUnescape.source.replace(/D/, delimiter), 'g');
      output.push(m1.replace(reDelimiterUnescape, delimiter));
    } else if (m2 !== undefined) { 
      output.push(m2);
    }
    return '';
  });

  // Handle special case of empty last value.
  var reEmptyLast = /S\s*$/;
  reEmptyLast = RegExp(reEmptyLast.source.replace(/S/, separator));
  if (reEmptyLast.test(csv)) {
    output.push('');
  }

  return output;
};

Note: I haven't tested yet but I think I could probably incorporate the last match into the main split/callback.

This is the code that does the split-by-line part:

$.csv2Array = function(csv, meta) {
  var meta = (meta !== undefined ? meta : {});
  var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;
  var skip = 'skip' in meta ? meta.skip : $.csvDefaults.skip;

  // process by line
  var lines = csv.split(/\r\n|\r|\n/g);
  var output = [];
  for(var i in lines) {
    if(i < skip) {
      continue;
    }
    // process each value
    var line = $.csvEntry2Array(lines[i], {
      delimiter: delimiter,
      separator: separator
    });
    output.push(line);
  }

  return output;
};

For a breakdown on how that reges works take a look at this answer. Mine is a slightly adapted version. I consolidated the single and double quote matching to match just one text delimiter and made the delimiter/separators dynamic. It does a great job of validating entiries but the line-splitting solution I added on top is pretty frail and breaks on the edge case I described above.

I'm just looking for a solution that walks the string extracting valid entries (to pass on to the entry parser) or fails on bad data returning an error indicating the line the parsing failed on.

Update:

splitLines: function(csv, delimiter) {
  var state = 0;
  var value = "";
  var line = "";
  var lines = [];
  function endOfRow() {
    lines.push(value);
    value = "";
    state = 0;
  };
  csv.replace(/(\"|,|\n|\r|[^\",\r\n]+)/gm, function (m0){
    switch (state) {
      // the start of an entry
      case 0:
        if (m0 === "\"") {
          state = 1;
        } else if (m0 === "\n") {
          endOfRow();
        } else if (/^\r$/.test(m0)) {
          // carriage returns are ignored
        } else {
          value += m0;
          state = 3;
        }
        break;
      // delimited input  
      case 1:
        if (m0 === "\"") {
          state = 2;
        } else {
          value += m0;
          state = 1;
        }
        break;
      // delimiter found in delimited input
      case 2:
        // is the delimiter escaped?
        if (m0 === "\"" && value.substr(value.length - 1) === "\"") {
          value += m0;
          state = 1;
        } else if (m0 === ",") {
          value += m0;
          state = 0;
        } else if (m0 === "\n") {
          endOfRow();
        } else if (m0 === "\r") {
          // Ignore
        } else {
          throw new Error("Illegal state");
        }
        break;
      // un-delimited input
      case 3:
        if (m0 === ",") {
          value += m0;
          state = 0;
        } else if (m0 === "\"") {
          throw new Error("Unquoted delimiter found");
        } else if (m0 === "\n") {
          endOfRow();
        } else if (m0 === "\r") {
          // Ignore
        } else {
          throw new Error("Illegal data");
        }
          break;
      default:
        throw new Error("Unknown state");
    }
    return "";
  });
  if (state != 0) {
    endOfRow();
  }
  return lines;
}

All it took is 4 states for a line splitter:

0: the start of an entry
1: the following is quoted
2: a second quote has been encountered
3: the following isn't quoted

It's almost a plete parser. For my use case, I just wanted a line splitter so I could provide a more granual approach to processing CSV data.

Note: Credit for this approach goes to another dev whom I won't name publicly without his permission. All I did was adapt it from a plete parser to a line-splitter.

Update:

Discovered a few broken edge cases in the previous lineSplitter implementation. The one provided should be fully RFC 4180 pliant.

If the following regex can split a csv string by line.

var lines = csv.split(/\r|\r?\n/g);

How could this be adapted to skip newline chars that are contained within a CSV value (Ie between quotes/double-quotes)?

Example:

2,"Evans & Sutherland","230-132-111AA",,"Visual","P
CB",,1,"Offsite",

If you don't see it, here's a version with the newlines visible:

2,"Evans & Sutherland","230-132-111AA",,"Visual","P\r\nCB",,1,"Offsite",\r\n

The part I'm trying to skip over is the newline contained in the middle of the "PCB" entry.

Update:

I probably should've mentioned this before but this is a part of a dedicated CSV parsing library called jquery-csv. To provide a better context I have added the current parser implementation below.

Here's the code for validating and parsing an entry (ie one line):

$.csvEntry2Array = function(csv, meta) {
  var meta = (meta !== undefined ? meta : {});
  var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;

  // build the CSV validator regex
  var reValid = /^\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*(?:S\s*(?:D[^D\\]*(?:\\[\S\s][^D\\]*)*D|[^SD\s\\]*(?:\s+[^SD\s\\]+)*)\s*)*$/;
  reValid = RegExp(reValid.source.replace(/S/g, separator));
  reValid = RegExp(reValid.source.replace(/D/g, delimiter));

  // build the CSV line parser regex
  var reValue = /(?!\s*$)\s*(?:D([^D\\]*(?:\\[\S\s][^D\\]*)*)D|([^SD\s\\]*(?:\s+[^SD\s\\]+)*))\s*(?:S|$)/g;
  reValue = RegExp(reValue.source.replace(/S/g, separator), 'g');
  reValue = RegExp(reValue.source.replace(/D/g, delimiter), 'g');

  // Return NULL if input string is not well formed CSV string.
  if (!reValid.test(csv)) {
    return null;
  }

  // "Walk" the string using replace with callback.
  var output = [];
  csv.replace(reValue, function(m0, m1, m2) {
    // Remove backslash from any delimiters in the value
    if (m1 !== undefined) {
      var reDelimiterUnescape = /\\D/g;              
      reDelimiterUnescape = RegExp(reDelimiterUnescape.source.replace(/D/, delimiter), 'g');
      output.push(m1.replace(reDelimiterUnescape, delimiter));
    } else if (m2 !== undefined) { 
      output.push(m2);
    }
    return '';
  });

  // Handle special case of empty last value.
  var reEmptyLast = /S\s*$/;
  reEmptyLast = RegExp(reEmptyLast.source.replace(/S/, separator));
  if (reEmptyLast.test(csv)) {
    output.push('');
  }

  return output;
};

Note: I haven't tested yet but I think I could probably incorporate the last match into the main split/callback.

This is the code that does the split-by-line part:

$.csv2Array = function(csv, meta) {
  var meta = (meta !== undefined ? meta : {});
  var separator = 'separator' in meta ? meta.separator : $.csvDefaults.separator;
  var delimiter = 'delimiter' in meta ? meta.delimiter : $.csvDefaults.delimiter;
  var skip = 'skip' in meta ? meta.skip : $.csvDefaults.skip;

  // process by line
  var lines = csv.split(/\r\n|\r|\n/g);
  var output = [];
  for(var i in lines) {
    if(i < skip) {
      continue;
    }
    // process each value
    var line = $.csvEntry2Array(lines[i], {
      delimiter: delimiter,
      separator: separator
    });
    output.push(line);
  }

  return output;
};

I'm just looking for a solution that walks the string extracting valid entries (to pass on to the entry parser) or fails on bad data returning an error indicating the line the parsing failed on.

Update:

splitLines: function(csv, delimiter) {
  var state = 0;
  var value = "";
  var line = "";
  var lines = [];
  function endOfRow() {
    lines.push(value);
    value = "";
    state = 0;
  };
  csv.replace(/(\"|,|\n|\r|[^\",\r\n]+)/gm, function (m0){
    switch (state) {
      // the start of an entry
      case 0:
        if (m0 === "\"") {
          state = 1;
        } else if (m0 === "\n") {
          endOfRow();
        } else if (/^\r$/.test(m0)) {
          // carriage returns are ignored
        } else {
          value += m0;
          state = 3;
        }
        break;
      // delimited input  
      case 1:
        if (m0 === "\"") {
          state = 2;
        } else {
          value += m0;
          state = 1;
        }
        break;
      // delimiter found in delimited input
      case 2:
        // is the delimiter escaped?
        if (m0 === "\"" && value.substr(value.length - 1) === "\"") {
          value += m0;
          state = 1;
        } else if (m0 === ",") {
          value += m0;
          state = 0;
        } else if (m0 === "\n") {
          endOfRow();
        } else if (m0 === "\r") {
          // Ignore
        } else {
          throw new Error("Illegal state");
        }
        break;
      // un-delimited input
      case 3:
        if (m0 === ",") {
          value += m0;
          state = 0;
        } else if (m0 === "\"") {
          throw new Error("Unquoted delimiter found");
        } else if (m0 === "\n") {
          endOfRow();
        } else if (m0 === "\r") {
          // Ignore
        } else {
          throw new Error("Illegal data");
        }
          break;
      default:
        throw new Error("Unknown state");
    }
    return "";
  });
  if (state != 0) {
    endOfRow();
  }
  return lines;
}

All it took is 4 states for a line splitter:

0: the start of an entry
1: the following is quoted
2: a second quote has been encountered
3: the following isn't quoted

It's almost a plete parser. For my use case, I just wanted a line splitter so I could provide a more granual approach to processing CSV data.

Note: Credit for this approach goes to another dev whom I won't name publicly without his permission. All I did was adapt it from a plete parser to a line-splitter.

Update:

Discovered a few broken edge cases in the previous lineSplitter implementation. The one provided should be fully RFC 4180 pliant.

Share edited Oct 7, 2021 at 6:01 CommunityBot 11 silver badge asked May 2, 2012 at 4:25 Evan Plaice 14.2k6 gold badges78 silver badges94 bronze badges

You simply cannot do this using regex. You can create a regex that handles some or even most conditions. But there will always be some valid CSV which will not work with the regex. – James Anderson Commented May 2, 2012 at 4:41
Regex is for detection, not parsing. As you scan across the text, if you have to "remember" anything besides the characters (i.e. i'm "inside" a quoted literal), you are parsing it. Subtle difference, which is why everyone wants to use it to parse. – Jeff Meatball Yang Commented May 2, 2012 at 5:14

Add a ment |

3 Answers 3

Sorted by: Reset to default 2

As I have noted in a ment there is no plete solution just using single regex.

A novel method using several regexps by splitting on ma and joining back strings with embedded mas is described here:-

Personally I would use a simple finite state machine as described here

The state machine has more code, but the code is cleaner and its clear what each piece of code is doing. Longer term this will be much more reliable and maintainable.

It's not a good idea to use regex's to parse. Better to use it to detect the "bad" splits and then merge them back:

var lines = csv.split(/\r?\n/g);
var bad = [];

for(var i=lines.length-1; i> 0; i--) {
    // find all the unescaped quotes on the line:
    var m = lines[i].match(/[^\\]?\"/g);

    // if there are an odd number of them, this line, and the line after it is bad:
    if((m ? m.length : 0) % 2 == 1) { bad.push(i--); }
}

// starting at the bottom of the list, merge lines back, using \r\n
for(var b=0,len=bad.length; b < len; b++) {
    lines.splice(bad[b]-1, 2, lines[bad[b]-1]+"\r\n"+lines[bad[b]]);
}

_{(This answer is licensed under both CC0 and WTFPL.)}

Be careful- That newline is PART of that value. It's not PCB, it's P\nCB.

However, why can't you just use string.split(',')? If need be, you can run through the list and cast to ints or remove the padded quotation marks.

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

javascript - Split a CSV string by line skipping newlines contained between quotes - Stack Overflow

3 Answers 3

与本文相关的文章

评论列表(0)