I'm trying to extract structured data from a git log
output which looks like this:
sha:"1ac31eadbe9cdf4d365de68b24a5daa2ab9c2575" refs:[HEAD->release/1.6.0;tag:release/1.6.0/api;origin/release/1.6.0] notes:[];
sha:"95c8adff0ec6a3064aa03f395e7cce63dd7cb21b" refs:[] notes:[];
sha:"2877f582fe1a4cdbf339d3183fca3da2e496b90b" refs:[] notes:[];
sha:"fbcd95c938e42c360b4ad9227ee05475e5308dd8" refs:[tag:release/1.5.0/api;tag:release/1.5.0/worker;tag:release/1.5.0/docs;origin/release/1.5.0] notes:[utc20241017065043 release/1.5.0/api
utc20250401123529 release/1.5.0/worker
utc20250401172120 release/1.5.0/docs
];
sha:"d520de3061cc71ab887b9dd924d8e432e4673127" refs:[] notes:[];
sha:"9f869c42e1209108fc061d383e48ae00e6dba21d" refs:[tag:1.0.0;tag:1.0-preview;origin/1.0.0] notes:[utc20250101163814 1.0.0
];
sha:"0b41aaa2a72b3029d6878bb827a03dc99188d1c5" refs:[] notes:[];
sha:"12b8220819a71b8a02c5ab542b4fae81d9644112" refs:[origin/experiment-123] notes:[];
sha:"3de7d9b9ed2dcf28a1aaa6db79239c55ce0ef48e" refs:[] notes:[];
(Some entries contain multi-line notes and multiple refs).
The Git command looks like this:
git log --pretty='tformat:sha:"%H" refs:[%(decorate:pointer=->,prefix=,suffix=,tag=tag:,separator=;)] notes:[%N];' --notes=custom-notes
I've tried adapting Superpower's JsonParser sample code to this task:
using System.Linq;
using Superpower;
using Superpower.Display;
using Superpower.Model;
using Superpower.Parsers;
using Superpower.Tokenizers;
enum GitLogToken
{
[Token(Example = "[")]
LSquareBracket,
[Token(Example = "]")]
RSquareBracket,
[Token(Example = ",")]
Comma,
[Token(Example = ":")]
Colon,
[Token(Example = ";")]
SemiColon,
String,
KeyPrefix,
}
static class GitLogTokenizer
{
static TextParser<Unit> GitLogStringToken { get; } =
from open in Character.EqualTo('"')
from content in Character.Except('"').Value(Unit.Value)
.IgnoreMany()
from close in Character.EqualTo('"')
select Unit.Value;
public static Tokenizer<GitLogToken> Instance { get; } =
new TokenizerBuilder<GitLogToken>()
.Ignore(Span.WhiteSpace)
.Match(Character.EqualTo(','), GitLogToken.Comma)
.Match(Character.EqualTo(':'), GitLogToken.Colon)
.Match(Character.EqualTo(';'), GitLogToken.SemiColon)
.Match(Character.EqualTo('['), GitLogToken.LSquareBracket)
.Match(Character.EqualTo(']'), GitLogToken.RSquareBracket)
.Match(GitLogStringToken, GitLogToken.String)
// Identifier.CStyle was supposed to tokenize `sha`, `refs` and `notes` prefixes. Not sure.
.Match(Identifier.CStyle, GitLogToken.KeyPrefix, requireDelimiters: false)
.Build();
}
static class GitLogTextParsers
{
public static TextParser<string> String { get; } =
from open in Character.EqualTo('"')
from chars in Character.ExceptIn('"')
.Many()
from close in Character.EqualTo('"')
select new string(chars);
}
static class GitLogParser
{
static TokenListParser<GitLogToken, object> GitLogString { get; } =
Token.EqualTo(GitLogToken.String)
.Apply(GitLogTextParsers.String)
.Select(s => (object)s);
static TokenListParser<GitLogToken, object> GitLogArray { get; } =
from begin in Token.EqualTo(GitLogToken.LSquareBracket)
from values in Parse.Ref(() => GitLogEntry!)
.ManyDelimitedBy(Token.EqualTo(GitLogToken.Comma),
end: Token.EqualTo(GitLogToken.RSquareBracket))
select (object)values;
static TokenListParser<GitLogToken, object?> GitLogEntry { get; } =
GitLogString.AsNullable()
.Or(GitLogArray.AsNullable())
// TODO: Each entry ends with semi-colon ';'
.Named("GitLog value");
// TODO: Each GitLogEntry can have array of notes. Notes are separated by newlines.
// TODO: Each GitLogEntry can have array of refs. Refs are separated by semi-colons.
// TODO: Multiple GitLogEntry items inside "log document".
static TokenListParser<GitLogToken, object?> GitLogDocument { get; } = GitLogEntry.AtEnd();
public static bool TryParse(string gitLog, out object? value, out string? error, out Position errorPosition)
{
var tokens = GitLogTokenizer.Instance.TryTokenize(gitLog);
if (!tokens.HasValue) {
value = null;
error = tokens.ToString();
errorPosition = tokens.ErrorPosition;
return false;
}
var parsed = GitLogDocument.TryParse(tokens.Value);
if (!parsed.HasValue) {
value = null;
error = parsed.ToString();
errorPosition = parsed.ErrorPosition;
return false;
}
value = parsed.Value;
error = null;
errorPosition = Position.Empty;
return true;
}
}
I don't know where to go from here.
Can anyone direct me to more suitable Superpower examples for these kinds of inputs (i.e. logs or textual data with ad hoc markup)?