最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

c# - Extract structured data from Git log output using Superpower - Stack Overflow

programmeradmin2浏览0评论

I'm trying to extract structured data from a git log output which looks like this:

sha:"1ac31eadbe9cdf4d365de68b24a5daa2ab9c2575" refs:[HEAD->release/1.6.0;tag:release/1.6.0/api;origin/release/1.6.0] notes:[];
sha:"95c8adff0ec6a3064aa03f395e7cce63dd7cb21b" refs:[] notes:[];
sha:"2877f582fe1a4cdbf339d3183fca3da2e496b90b" refs:[] notes:[];
sha:"fbcd95c938e42c360b4ad9227ee05475e5308dd8" refs:[tag:release/1.5.0/api;tag:release/1.5.0/worker;tag:release/1.5.0/docs;origin/release/1.5.0] notes:[utc20241017065043 release/1.5.0/api

utc20250401123529 release/1.5.0/worker

utc20250401172120 release/1.5.0/docs
];
sha:"d520de3061cc71ab887b9dd924d8e432e4673127" refs:[] notes:[];
sha:"9f869c42e1209108fc061d383e48ae00e6dba21d" refs:[tag:1.0.0;tag:1.0-preview;origin/1.0.0] notes:[utc20250101163814 1.0.0
];
sha:"0b41aaa2a72b3029d6878bb827a03dc99188d1c5" refs:[] notes:[];
sha:"12b8220819a71b8a02c5ab542b4fae81d9644112" refs:[origin/experiment-123] notes:[];
sha:"3de7d9b9ed2dcf28a1aaa6db79239c55ce0ef48e" refs:[] notes:[];

(Some entries contain multi-line notes and multiple refs).

The Git command looks like this:

git log --pretty='tformat:sha:"%H" refs:[%(decorate:pointer=->,prefix=,suffix=,tag=tag:,separator=;)] notes:[%N];' --notes=custom-notes

I've tried adapting Superpower's JsonParser sample code to this task:

using System.Linq;
using Superpower;
using Superpower.Display;
using Superpower.Model;
using Superpower.Parsers;
using Superpower.Tokenizers;

enum GitLogToken
{
    [Token(Example = "[")]
    LSquareBracket,

    [Token(Example = "]")]
    RSquareBracket,

    [Token(Example = ",")]
    Comma,

    [Token(Example = ":")]
    Colon,

    [Token(Example = ";")]
    SemiColon,

    String,

    KeyPrefix,
}

static class GitLogTokenizer
{
    static TextParser<Unit> GitLogStringToken { get; } =
        from open in Character.EqualTo('"')
        from content in Character.Except('"').Value(Unit.Value)
            .IgnoreMany()
        from close in Character.EqualTo('"')
        select Unit.Value;

    public static Tokenizer<GitLogToken> Instance { get; } =
        new TokenizerBuilder<GitLogToken>()
            .Ignore(Span.WhiteSpace)
            .Match(Character.EqualTo(','), GitLogToken.Comma)
            .Match(Character.EqualTo(':'), GitLogToken.Colon)
            .Match(Character.EqualTo(';'), GitLogToken.SemiColon)
            .Match(Character.EqualTo('['), GitLogToken.LSquareBracket)
            .Match(Character.EqualTo(']'), GitLogToken.RSquareBracket)
            .Match(GitLogStringToken, GitLogToken.String)
            // Identifier.CStyle was supposed to tokenize `sha`, `refs` and `notes` prefixes. Not sure.
            .Match(Identifier.CStyle, GitLogToken.KeyPrefix, requireDelimiters: false)
            .Build();
}

static class GitLogTextParsers
{
    public static TextParser<string> String { get; } =
        from open in Character.EqualTo('"')
        from chars in Character.ExceptIn('"')
            .Many()
        from close in Character.EqualTo('"')
        select new string(chars);
}

static class GitLogParser
{
    static TokenListParser<GitLogToken, object> GitLogString { get; } =
        Token.EqualTo(GitLogToken.String)
            .Apply(GitLogTextParsers.String)
            .Select(s => (object)s);

    static TokenListParser<GitLogToken, object> GitLogArray { get; } =
        from begin in Token.EqualTo(GitLogToken.LSquareBracket)
        from values in Parse.Ref(() => GitLogEntry!)
            .ManyDelimitedBy(Token.EqualTo(GitLogToken.Comma),
                end: Token.EqualTo(GitLogToken.RSquareBracket))
        select (object)values;

    static TokenListParser<GitLogToken, object?> GitLogEntry { get; } =
        GitLogString.AsNullable()
            .Or(GitLogArray.AsNullable())
            // TODO: Each entry ends with semi-colon ';'
            .Named("GitLog value");

    // TODO: Each GitLogEntry can have array of notes. Notes are separated by newlines.

    // TODO: Each GitLogEntry can have array of refs. Refs are separated by semi-colons.

    // TODO: Multiple GitLogEntry items inside "log document".

    static TokenListParser<GitLogToken, object?> GitLogDocument { get; } = GitLogEntry.AtEnd();

    public static bool TryParse(string gitLog, out object? value, out string? error, out Position errorPosition)
    {
        var tokens = GitLogTokenizer.Instance.TryTokenize(gitLog);
        if (!tokens.HasValue) {
            value = null;
            error = tokens.ToString();
            errorPosition = tokens.ErrorPosition;
            return false;
        }

        var parsed = GitLogDocument.TryParse(tokens.Value);
        if (!parsed.HasValue) {
            value = null;
            error = parsed.ToString();
            errorPosition = parsed.ErrorPosition;
            return false;
        }

        value = parsed.Value;
        error = null;
        errorPosition = Position.Empty;
        return true;
    }
}

I don't know where to go from here.

Can anyone direct me to more suitable Superpower examples for these kinds of inputs (i.e. logs or textual data with ad hoc markup)?

发布评论

评论列表(0)

  1. 暂无评论