te')); return $arr; } /* 遍历用户所有主题 * @param $uid 用户ID * @param int $page 页数 * @param int $pagesize 每页记录条数 * @param bool $desc 排序方式 TRUE降序 FALSE升序 * @param string $key 返回的数组用那一列的值作为 key * @param array $col 查询哪些列 */ function thread_tid_find_by_uid($uid, $page = 1, $pagesize = 1000, $desc = TRUE, $key = 'tid', $col = array()) { if (empty($uid)) return array(); $orderby = TRUE == $desc ? -1 : 1; $arr = thread_tid__find($cond = array('uid' => $uid), array('tid' => $orderby), $page, $pagesize, $key, $col); return $arr; } // 遍历栏目下tid 支持数组 $fid = array(1,2,3) function thread_tid_find_by_fid($fid, $page = 1, $pagesize = 1000, $desc = TRUE) { if (empty($fid)) return array(); $orderby = TRUE == $desc ? -1 : 1; $arr = thread_tid__find($cond = array('fid' => $fid), array('tid' => $orderby), $page, $pagesize, 'tid', array('tid', 'verify_date')); return $arr; } function thread_tid_delete($tid) { if (empty($tid)) return FALSE; $r = thread_tid__delete(array('tid' => $tid)); return $r; } function thread_tid_count() { $n = thread_tid__count(); return $n; } // 统计用户主题数 大数量下严谨使用非主键统计 function thread_uid_count($uid) { $n = thread_tid__count(array('uid' => $uid)); return $n; } // 统计栏目主题数 大数量下严谨使用非主键统计 function thread_fid_count($fid) { $n = thread_tid__count(array('fid' => $fid)); return $n; } ?>r - cleaner way to match observations using IDs in different columns - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

r - cleaner way to match observations using IDs in different columns - Stack Overflow

programmeradmin2浏览0评论

I am using mark-recapture data, with each individual having a PIT tag (Tag) and a genetic sample ID given at the first observation (ID), which is typically used as the ID for the individual (Unified.ID). However, sometimes an animal will lose a PIT tag, and we have to retag it, but if we know its old tag number we record that as Alt_tag.

This sample data is four observations from the same individual, so I am trying to get them all to have the same Unified.ID by matching the Alt_tag in row 4 to the Tag in rows 1:3, so that all 4 rows end up with "CAL163037" in the Unified.ID column.

sample.dat<-tibble(ID=c("CAL163037",NA,NA,NA),
           Tag = c("132800","132800","132800","981930"),
           Date = c("2016-08-23","2017-06-09","2017-06-22","2017-08-23"),
           Alt_tag = c(NA,NA,NA,"132800"),
           Unified.ID = c("CAL163037","CAL163037","CAL163037","981930"))

# A tibble: 4 × 5
  ID        Tag    Date       Alt_tag Unified.ID
  <chr>     <chr>  <chr>      <chr>   <chr>     
1 CAL163037 132800 2016-08-23 NA      CAL163037 
2 NA        132800 2017-06-09 NA      CAL163037 
3 NA        132800 2017-06-22 NA      CAL163037 
4 NA        981930 2017-08-23 132800  981930  

I think I sort of have something close with the code below, but I feel like there is a more elegant way to do this, preferably with some kind of dplyr mutate, so it is easier to check that it is doing what I want it to (I can't get this if_else function working inside of mutate()). My full dataset is pretty big, so I want to be confident that it is working properly across the whole thing. Any ideas of how to clean this up?

> if_else(is.na(sample.dat$Alt_tag)==FALSE,(sample.dat[
which(sample.dat$Tag%in%sample.dat$Alt_tag),"Unified.ID"]%>%distinct()),NA)

# A tibble: 4 × 1
# Groups:   Unified.ID [1]
  Unified.ID
  <fct>     
1 CAL163037 
2 CAL163037 
3 CAL163037 
4 CAL163037 

I am using mark-recapture data, with each individual having a PIT tag (Tag) and a genetic sample ID given at the first observation (ID), which is typically used as the ID for the individual (Unified.ID). However, sometimes an animal will lose a PIT tag, and we have to retag it, but if we know its old tag number we record that as Alt_tag.

This sample data is four observations from the same individual, so I am trying to get them all to have the same Unified.ID by matching the Alt_tag in row 4 to the Tag in rows 1:3, so that all 4 rows end up with "CAL163037" in the Unified.ID column.

sample.dat<-tibble(ID=c("CAL163037",NA,NA,NA),
           Tag = c("132800","132800","132800","981930"),
           Date = c("2016-08-23","2017-06-09","2017-06-22","2017-08-23"),
           Alt_tag = c(NA,NA,NA,"132800"),
           Unified.ID = c("CAL163037","CAL163037","CAL163037","981930"))

# A tibble: 4 × 5
  ID        Tag    Date       Alt_tag Unified.ID
  <chr>     <chr>  <chr>      <chr>   <chr>     
1 CAL163037 132800 2016-08-23 NA      CAL163037 
2 NA        132800 2017-06-09 NA      CAL163037 
3 NA        132800 2017-06-22 NA      CAL163037 
4 NA        981930 2017-08-23 132800  981930  

I think I sort of have something close with the code below, but I feel like there is a more elegant way to do this, preferably with some kind of dplyr mutate, so it is easier to check that it is doing what I want it to (I can't get this if_else function working inside of mutate()). My full dataset is pretty big, so I want to be confident that it is working properly across the whole thing. Any ideas of how to clean this up?

> if_else(is.na(sample.dat$Alt_tag)==FALSE,(sample.dat[
which(sample.dat$Tag%in%sample.dat$Alt_tag),"Unified.ID"]%>%distinct()),NA)

# A tibble: 4 × 1
# Groups:   Unified.ID [1]
  Unified.ID
  <fct>     
1 CAL163037 
2 CAL163037 
3 CAL163037 
4 CAL163037 
Share Improve this question asked Feb 17 at 12:22 ghainesghaines 836 bronze badges 3
  • 1 What is "clean"? Base R? – Friede Commented Feb 17 at 13:19
  • 1 One way to clean up your code is to replace is.na(sample.dat$Alt_tag)==FALSE with !is.na(sample.dat$Alt_tag) – jpsmith Commented Feb 17 at 14:30
  • @Friede what I mean is that the above solution returns a vector which has to replace the original vector, and it also has a lot of nested functions that make it difficult to read (%in% within which() within an index before distinct() within if_else()), it's very roundabout and inefficient. – ghaines Commented Feb 17 at 15:07
Add a comment  | 

2 Answers 2

Reset to default 2

Probably you can try this?

sample.dat %>%
    mutate(Unified.Tag = coalesce(Alt_tag, Tag)) %>%
    mutate(Unified.ID = unique(na.omit(ID)), .by = Unified.Tag) %>%
    select(-Unified.Tag)

which gives

# A tibble: 4 × 5
  ID        Tag    Date       Alt_tag Unified.ID
  <chr>     <chr>  <chr>      <chr>   <chr>
1 CAL163037 132800 2016-08-23 NA      CAL163037
2 NA        132800 2017-06-09 NA      CAL163037
3 NA        132800 2017-06-22 NA      CAL163037
4 NA        981930 2017-08-23 132800  CAL163037

Check if the Alt_tag matches a Tag. If it does use the row number as returned by match to index into Unified.ID; otherwise, match will return NA so use the current row_number() to so index. Note that coalesce returns its first non-NA argument.

library(dplyr)

sample.dat %>%
  mutate(Unified.ID = Unified.ID[coalesce(match(Alt_tag, Tag), row_number())])

giving

# A tibble: 4 × 5
  ID        Tag    Date       Alt_tag Unified.ID
  <chr>     <chr>  <chr>      <chr>   <chr>     
1 CAL163037 132800 2016-08-23 <NA>    CAL163037 
2 <NA>      132800 2017-06-09 <NA>    CAL163037 
3 <NA>      132800 2017-06-22 <NA>    CAL163037 
4 <NA>      981930 2017-08-23 132800  CAL163037 

Note that with the data in the question it runs quite fast:

library(microbenchmark)\

microbenchmark(
  A = sample.dat %>%
    mutate(Unified.ID = Unified.ID[coalesce(match(Alt_tag, Tag), row_number())]),
  B = sample.dat %>%
    mutate(Unified.Tag = coalesce(Alt_tag, Tag)) %>%
    mutate(Unified.ID = unique(na.omit(ID)), .by = Unified.Tag) %>%
    select(-Unified.Tag)
)

## Unit: milliseconds
##  expr    min      lq     mean  median       uq     max neval cld
##     A 2.9443 3.12035 3.418708 3.22425  3.79555  4.5942   100  a 
##     B 8.8230 9.19820 9.997262 9.48225 10.99610 13.7240   100   b
发布评论

评论列表(0)

  1. 暂无评论