te')); return $arr; } /* List all threads of a user * @param $uid user ID * @param int $page page number * @param int $pagesize records per page * @param bool $desc sort order: TRUE descending, FALSE ascending * @param string $key which column's value to use as the key of the returned array * @param array $col which columns to query */ function thread_tid_find_by_uid($uid, $page = 1, $pagesize = 1000, $desc = TRUE, $key = 'tid', $col = array()) { if (empty($uid)) return array(); $orderby = TRUE == $desc ? -1 : 1; $arr = thread_tid__find($cond = array('uid' => $uid), array('tid' => $orderby), $page, $pagesize, $key, $col); return $arr; } // List tids under a forum; supports arrays, e.g. $fid = array(1,2,3) function thread_tid_find_by_fid($fid, $page = 1, $pagesize = 1000, $desc = TRUE) { if (empty($fid)) return array(); $orderby = TRUE == $desc ? -1 : 1; $arr = thread_tid__find($cond = array('fid' => $fid), array('tid' => $orderby), $page, $pagesize, 'tid', array('tid', 'verify_date')); return $arr; } function thread_tid_delete($tid) { if (empty($tid)) return FALSE; $r = thread_tid__delete(array('tid' => $tid)); return $r; } function thread_tid_count() { $n = thread_tid__count(); return $n; } // Count a user's threads; use non-primary-key counts with caution on large datasets function thread_uid_count($uid) { $n = thread_tid__count(array('uid' => $uid)); return $n; } // Count a forum's threads; use non-primary-key counts with caution on large datasets function thread_fid_count($fid) { $n = thread_tid__count(array('fid' => $fid)); return $n; } ?>python - Passing a polars struct to a user-defind function using map_batches - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

python - Passing a polars struct to a user-defined function using map_batches - Stack Overflow

programmeradmin2浏览0评论

I need to pass a variable number of columns to a user-defined function. The docs mention to first create a pl.struct and subsequently let the function extract it. Here's the example given on the website:

# Add two arrays together:
@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
def add(arr, arr2, result):
    """Elementwise sum of two equal-length arrays (numba gufunc).

    `result` is the preallocated output buffer supplied by guvectorize.
    """
    for i, (a, b) in enumerate(zip(arr, arr2)):
        result[i] = a + b


df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})

# Bundle the two columns into a single struct column, then hand the
# struct's individual fields to add() inside map_batches.
combined_cols = pl.struct(["values1", "values2"])
out = df3.select(
    combined_cols.map_batches(
        lambda combined: add(
            combined.struct.field("values1"), combined.struct.field("values2")
        )
    ).alias("add_columns")
)
print(out)

Now, in my case, I don't know upfront how many columns will enter the pl.struct. Think of using a selector like pl.struct(cs.float()). In my user-defined function, I need to operate on a np.array. That is, the user-defined function will have one input argument that takes the whole array. How can I then extract it within the user-defined function?

EDIT: The output of my user-defined function will be an array that has the exact same shape as the input array. This array needs to be appended to the existing dataframe on axis 1 (new columns).

EDIT: Using pl.concat_arr might be one way to attack my concrete issue. My use case would be along the following lines:

def multiply_by_two(arr):
    """Return the input scaled by two (stand-in for real array work)."""
    # In reality, there are some complex array operations.
    scaled = arr * 2
    return scaled


df = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})

# Fuse the two columns into one Array column and run the UDF over it.
stacked = pl.concat_arr(["values1", "values2"])
out = df.select(
    stacked.map_batches(multiply_by_two).alias("result")
)

The new computed column result holds an array that has the same shape as the input array. I need to unnest the array (something like pl.struct.unnest()). The headings should be the original headings suffixed by "result" (values1_result and values2_result).

Also, I would like to make use of @guvectorize to speed things up.

I need to pass a variable number of columns to a user-defined function. The docs mention to first create a pl.struct and subsequently let the function extract it. Here's the example given on the website:

# Add two arrays together:
@guvectorize([(int64[:], int64[:], float64[:])], "(n),(n)->(n)")
def add(arr, arr2, result):
    # guvectorize supplies `result` as a preallocated output buffer.
    for i in range(len(arr)):
        result[i] = arr[i] + arr2[i]


df3 = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})

out = df3.select(
    # Create a struct that has two columns in it:
    pl.struct(["values1", "values2"])
    # Pass the struct to a lambda that then passes the individual columns to
    # the add() function:
    .map_batches(
        lambda combined: add(
            combined.struct.field("values1"), combined.struct.field("values2")
        )
    )
    # Name the resulting output column:
    .alias("add_columns")
)
print(out)

Now, in my case, I don't know upfront how many columns will enter the pl.struct. Think of using a selector like pl.struct(cs.float()). In my user-defined function, I need to operate on a np.array. That is, the user-defined function will have one input argument that takes the whole array. How can I then extract it within the user-defined function?

EDIT: The output of my user-defined function will be an array that has the exact same shape as the input array. This array needs to be appended to the existing dataframe on axis 1 (new columns).

EDIT: Using pl.concat_arr might be one way to attack my concrete issue. My use case would be along the following lines:

def multiply_by_two(arr):
    """Return the input with every element doubled (stand-in for real work)."""
    # In reality, there are some complex array operations.
    return arr * 2


df = pl.DataFrame({"values1": [1, 2, 3], "values2": [10, 20, 30]})

out = df.select(
    # Create an array consisting of two columns:
    pl.concat_arr(["values1", "values2"])
    # map_batches hands the whole Array column to the UDF as one Series:
    .map_batches(lambda arr: multiply_by_two(arr))
    .alias("result")
)

The new computed column result holds an array that has the same shape as the input array. I need to unnest the array (something like pl.struct.unnest()). The headings should be the original headings suffixed by "result" (values1_result and values2_result).

Also, I would like to make use of @guvectorize to speed things up.

Share Improve this question edited 2 days ago Andi asked 2 days ago AndiAndi 4,8755 gold badges33 silver badges63 bronze badges 6
  • Have you considered using this? – user459872 Commented 2 days ago
  • Inside map_batches you have a Series, if you want a numpy array you can just call s.to_numpy() - but it may make more sense to use pl.concat_arr instead of a struct for your use case? Perhaps you give a code example that is closer to the actual task. – jqurious Commented 2 days ago
  • @jqurious I added another example. – Andi Commented 2 days ago
  • arr is a pl.Series, you can just use arr.to_numpy() if you want a numpy array - right? – jqurious Commented yesterday
  • If you are talking about arr.to_numpy() within multiply_by_two, then you're correct. Having said that, I think this is not going to work in conjunction with guvectorize. I am also struggling when performing map_batches over a group_by. – Andi Commented yesterday
 |  Show 1 more comment

1 Answer 1

Reset to default 0

A few things, if you use .to_numpy on either an array or a struct, it seems to return the same np.array so the difference in which to choose comes down to memory efficiency and features. The elements of an Array aren't named and you want the output names to correspond to the input columns so that means you probably want a struct. I'm not sure what the memory implications are between the two. I know that going from columns to a struct is cheaper than going from columns to Array but intuitively it seems that columns->struct->np.array ought to be about the same as columns->array->np.array.

Anyway, with that said, here's how to do it:

def multiply_by_two(arr: pl.Series) -> pl.Series:
    """Double a struct Series elementwise, suffixing field names with _result."""
    # Capture the input field names before converting to numpy.
    field_names = arr.struct.fields
    # Struct -> 2D ndarray, then apply the (placeholder) array math.
    doubled = arr.to_numpy() * 2
    result_names = [f"{name}_result" for name in field_names]
    return pl.Series(doubled).arr.to_struct(fields=result_names)

# Build the struct, run the UDF over it, then flatten the resulting struct
# column back into ordinary columns at the DataFrame level.
doubled = pl.struct(["values1", "values2"]).map_batches(multiply_by_two)
df.with_columns(doubled.alias("result")).unnest("result")

shape: (3, 4)
┌─────────┬─────────┬────────────────┬────────────────┐
│ values1 ┆ values2 ┆ values1_result ┆ values2_result │
│ ---     ┆ ---     ┆ ---            ┆ ---            │
│ i64     ┆ i64     ┆ i64            ┆ i64            │
╞═════════╪═════════╪════════════════╪════════════════╡
│ 1       ┆ 10      ┆ 2              ┆ 20             │
│ 2       ┆ 20      ┆ 4              ┆ 40             │
│ 3       ┆ 30      ┆ 6              ┆ 60             │
└─────────┴─────────┴────────────────┴────────────────┘

You can't unnest from within the .with_columns; you have to do it at the DataFrame level.

As for combining the above with numba, it should be relatively the same. Just search for polars and numba to find other questions/answers where the two are used together. If you can make a more specific question specifically about their interaction then ask away.

发布评论

评论列表(0)

  1. 暂无评论