te')); return $arr; }

/**
 * List all threads of a user.
 *
 * @param mixed  $uid      User ID; an empty value returns an empty array immediately.
 * @param int    $page     Page number.
 * @param int    $pagesize Number of records per page.
 * @param bool   $desc     Sort direction: TRUE descending, FALSE ascending.
 * @param string $key      Column whose value is used as the key of the returned array.
 * @param array  $col      Columns to query.
 * @return array Result of thread_tid__find() for the condition uid = $uid.
 */
function thread_tid_find_by_uid($uid, $page = 1, $pagesize = 1000, $desc = TRUE, $key = 'tid', $col = array()) {
	if (empty($uid)) return array();
	// Order on tid: -1 when $desc is TRUE, 1 otherwise.
	$orderby = TRUE == $desc ? -1 : 1;
	$arr = thread_tid__find($cond = array('uid' => $uid), array('tid' => $orderby), $page, $pagesize, $key, $col);
	return $arr;
}

// List the tids under a forum. An array is also supported, e.g. $fid = array(1,2,3).
// Results are keyed by 'tid' and only the 'tid' and 'verify_date' columns are requested.
function thread_tid_find_by_fid($fid, $page = 1, $pagesize = 1000, $desc = TRUE) {
	if (empty($fid)) return array();
	// Order on tid: -1 when $desc is TRUE, 1 otherwise.
	$orderby = TRUE == $desc ? -1 : 1;
	$arr = thread_tid__find($cond = array('fid' => $fid), array('tid' => $orderby), $page, $pagesize, 'tid', array('tid', 'verify_date'));
	return $arr;
}

// Delete the record matching $tid; returns FALSE without calling the backend when $tid is empty.
function thread_tid_delete($tid) {
	if (empty($tid)) return FALSE;
	$r = thread_tid__delete(array('tid' => $tid));
	return $r;
}

// Count all records (no condition is passed to thread_tid__count()).
function thread_tid_count() {
	$n = thread_tid__count();
	return $n;
}

// Count a user's threads. Use non-primary-key counting with caution on large data sets.
function thread_uid_count($uid) {
	$n = thread_tid__count(array('uid' => $uid));
	return $n;
}

// Count a forum's threads. Use non-primary-key counting with caution on large data sets.
function thread_fid_count($fid) {
	$n = thread_tid__count(array('fid' => $fid));
	return $n;
}

?>python - Topological sort in Polars - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

python - Topological sort in Polars - Stack Overflow

programmeradmin3浏览0评论
# Example frame: columns A and B are each in ascending order once nulls are ignored.
# The table has 7 data rows, so the repr header must say (7, 2).
df = pl.from_repr('''
shape: (7, 2)
┌──────┬──────┐
│ A    ┆ B    │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 1    ┆ null │
│ 2    ┆ 1    │
│ 2    ┆ 2    │
│ null ┆ 3    │
│ 3    ┆ 4    │
│ 4    ┆ null │
│ 5    ┆ 5    │
└──────┴──────┘
''')

I want to sort a dataframe such that multiple columns are in a sorted order, excluding nulls.

In the example above, columns A and B are both sorted, excluding nulls. This feels like a topological sort to me, with the following conditions:

df[0, 'A'] < df[1, 'A']
df[1, 'B'] < df[2, 'B']
df[2, 'B'] < df[3, 'B']
df[3, 'B'] < df[4, 'B']
df[4, 'A'] < df[5, 'A']
df[5, 'A'] < df[6, 'A']

I understand it's not always possible to do a topological sort if there is a cycle, e.g.

df[0, 'A'] < df[1, 'A']
df[0, 'B'] > df[1, 'B']

In that case, I want to specify that ordering for column A should take precedence over column B.

My use case is that I am merging time series data from multiple datasets with some overlapping events, and I want a single dataframe with all events in a chronological order. There are issues with some of the timestamps, so I cannot compare the raw timestamps directly across datasets.

Is something like this possible in polars?

# Example frame: columns A and B are each in ascending order once nulls are ignored.
# The table has 7 data rows, so the repr header must say (7, 2).
df = pl.from_repr('''
shape: (7, 2)
┌──────┬──────┐
│ A    ┆ B    │
│ ---  ┆ ---  │
│ i64  ┆ i64  │
╞══════╪══════╡
│ 1    ┆ null │
│ 2    ┆ 1    │
│ 2    ┆ 2    │
│ null ┆ 3    │
│ 3    ┆ 4    │
│ 4    ┆ null │
│ 5    ┆ 5    │
└──────┴──────┘
''')

I want to sort a dataframe such that multiple columns are in a sorted order, excluding nulls.

In the example above, columns A and B are both sorted, excluding nulls. This feels like a topological sort to me, with the following conditions:

df[0, 'A'] < df[1, 'A']
df[1, 'B'] < df[2, 'B']
df[2, 'B'] < df[3, 'B']
df[3, 'B'] < df[4, 'B']
df[4, 'A'] < df[5, 'A']
df[5, 'A'] < df[6, 'A']

I understand it's not always possible to do a topological sort if there is a cycle, e.g.

df[0, 'A'] < df[1, 'A']
df[0, 'B'] > df[1, 'B']

In that case, I want to specify that ordering for column A should take precedence over column B.

My use case is that I am merging time series data from multiple datasets with some overlapping events, and I want a single dataframe with all events in a chronological order. There are issues with some of the timestamps, so I cannot compare the raw timestamps directly across datasets.

Is something like this possible in polars?

Share · Improve this question · edited 2 days ago by jqurious (21.4k reputation; 4 gold badges, 20 silver badges, 39 bronze badges) · asked Feb 18 at 4:45 by T.H Rice (307 reputation; 1 gold badge, 2 silver badges, 9 bronze badges)
Add a comment  | 

1 Answer 1

Reset to default 1

You can specify multiple columns when you call DataFrame.sort (or LazyFrame.sort), but they only support absolute ordering with nulls sent to the start or the end.

You could try to customize your sorting logic using pl.arg_sort_by, col.sort_by and so on; however, it will probably be very inefficient compared to the built-in sort method.

Example

# Sort key: column A with its nulls filled in from the neighbouring row in B order,
# so rows that only have a B value still get a position relative to A.
expression = (
    pl.col('A')
    # Order by B so each null in A sits right after the row that precedes it in B order
    .sort_by("B")
    # Fill each null in A with the preceding non-null A value in that order
    .forward_fill()
    # Sort back into the original row order
    .sort_by(pl.col('idx').sort_by("B"))
)

# "B" must be passed to sort(), not print(): it is the secondary key that breaks
# ties among rows sharing the same filled-in A value (A takes precedence over B).
print(df.with_row_index('idx').sort(expression, "B"))
shape: (7, 3)
┌─────┬──────┬──────┐
│ idx ┆ A    ┆ B    │
│ --- ┆ ---  ┆ ---  │
│ u32 ┆ i64  ┆ i64  │
╞═════╪══════╪══════╡
│ 0   ┆ 1    ┆ null │
│ 1   ┆ 2    ┆ 1    │
│ 2   ┆ 2    ┆ 2    │
│ 3   ┆ null ┆ 3    │
│ 4   ┆ 3    ┆ 4    │
│ 5   ┆ 4    ┆ null │
│ 6   ┆ 5    ┆ 5    │
└─────┴──────┴──────┘
发布评论

评论列表(0)

  1. 暂无评论