te')); return $arr; } /* 遍历用户所有主题 * @param $uid 用户ID * @param int $page 页数 * @param int $pagesize 每页记录条数 * @param bool $desc 排序方式 TRUE降序 FALSE升序 * @param string $key 返回的数组用那一列的值作为 key * @param array $col 查询哪些列 */ function thread_tid_find_by_uid($uid, $page = 1, $pagesize = 1000, $desc = TRUE, $key = 'tid', $col = array()) { if (empty($uid)) return array(); $orderby = TRUE == $desc ? -1 : 1; $arr = thread_tid__find($cond = array('uid' => $uid), array('tid' => $orderby), $page, $pagesize, $key, $col); return $arr; } // 遍历栏目下tid 支持数组 $fid = array(1,2,3) function thread_tid_find_by_fid($fid, $page = 1, $pagesize = 1000, $desc = TRUE) { if (empty($fid)) return array(); $orderby = TRUE == $desc ? -1 : 1; $arr = thread_tid__find($cond = array('fid' => $fid), array('tid' => $orderby), $page, $pagesize, 'tid', array('tid', 'verify_date')); return $arr; } function thread_tid_delete($tid) { if (empty($tid)) return FALSE; $r = thread_tid__delete(array('tid' => $tid)); return $r; } function thread_tid_count() { $n = thread_tid__count(); return $n; } // 统计用户主题数 大数量下严谨使用非主键统计 function thread_uid_count($uid) { $n = thread_tid__count(array('uid' => $uid)); return $n; } // 统计栏目主题数 大数量下严谨使用非主键统计 function thread_fid_count($fid) { $n = thread_tid__count(array('fid' => $fid)); return $n; } ?>split - Splitting one column to three columns for uneven characters in r - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

split - Splitting one column to three columns for uneven characters in r - Stack Overflow

programmeradmin4浏览0评论

I tried to split a single column into three columns, but I failed. I have the following data set:

> dat
 name
 Jhon Austin B 100kg
 Mick Gray C 110kg
 Tom Jef A 30kg

First, I tried to extract the last word using the following code:

library(tidyr)

   dt<-dat %>% separate(name, into = c('name', 'pack'), sep = -6, convert = TRUE)

I got the following one

name           pack
Jhon Austin B  100kg
Mick Gray C    110kg
Tom Jef        A30kg

Here, "A" was merged with "30kg" in the last row, although the two should be in separate columns. My final result should look like this:

name         class   pack
Jhon Austin   B      100kg
Mick Gray     C      110kg
Tom Jef       A      30kg

I will be grateful if anyone helps me. Thanks in advance.

I tried to split a single column into three columns, but I failed. I have the following data set:

> dat
 name
 Jhon Austin B 100kg
 Mick Gray C 110kg
 Tom Jef A 30kg

First, I tried to extract the last word using the following code:

library(tidyr)

   dt<-dat %>% separate(name, into = c('name', 'pack'), sep = -6, convert = TRUE)

I got the following one

name           pack
Jhon Austin B  100kg
Mick Gray C    110kg
Tom Jef        A30kg

Here, "A" was merged with "30kg" in the last row, although the two should be in separate columns. My final result should look like this:

name         class   pack
Jhon Austin   B      100kg
Mick Gray     C      110kg
Tom Jef       A      30kg

I will be grateful if anyone helps me. Thanks in advance.

Share Improve this question edited 2 days ago Edward 19.1k3 gold badges16 silver badges35 bronze badges asked 2 days ago RokibRokib 1277 bronze badges 1
  • Related: stackoverflow/questions/4350440/… AND stackoverflow/questions/7069076/… could split first and last name into separate columns then merge back together for full name this way. – Kelly Ireland Commented 2 days ago
Add a comment  | 

5 Answers 5

Reset to default 5
  • Option 1

You could try separate_wider_regex

dat %>%
    separate_wider_regex(
        name,
        patterns = c(name = ".*", " ", class = "\\w", " ", pack = "\\d+kg")
    )
  • Option 2

With base R, you can try sub + read.table

with(
    dat,
    setNames(
        read.table(
            text =
                sub("^(.*)\\s(\\w)\\s(\\d+.*)$", "\\1_\\2_\\3", name),
            sep = "_"
        ),
        c("name", "class", "pack")
    )
)

which gives

# A tibble: 3 × 3
  name        class  pack
  <chr>       <chr> <chr>
1 Jhon Austin B     100kg
2 Mick Gray   C     110kg
3 Tom Jef     A     30kg

data

dat <- data.frame(
    name = c(
        "Jhon Austin B 100kg",
        "Mick Gray C 110kg",
        "Tom Jef A 30kg"
    )
)

We could use the str_extract() function from the stringr library:

library(stringr)

dat$class <- str_extract(dat$name, "\\b[A-Z](?= \\d+\\w+$)")
dat$pack <- str_extract(dat$name, "\\b\\d+\\w+$")
dat$name <- str_extract(dat$name, "\\w+(?: \\w+)(?= [A-Z] \\d+\\w+$)")
dat

         name class  pack
1 Jhon Austin     B 100kg
2   Mick Gray     C 110kg
3     Tom Jef     A  30kg

To handle names with more or fewer than two parts, we could write a small string-reversing helper function rv, then strsplit at spaces and recombine appropriately.

> rv <- \(x) {
+   strsplit(x, '') |> lapply(rev) |> sapply(paste, collapse='')
+ }
> rv(dat$name) |> sapply(strsplit, ' ') |> 
+   lapply(\(x) c(paste(x[-(1:2)], collapse=' '), x[2:1])) |> 
+   lapply(rv) |> do.call(what='rbind') |> `rownames<-`(NULL) |> 
+   as.data.frame() |> type.convert(as.is=TRUE) |> setNames(c('name', 'class', 'pack'))
            name class  pack
1    Jhon Austin     B 100kg
2      Mick Gray     C 110kg
3        Tom Jef     A  30kg
4 John F Kennedy     A  30kg
5            Foo     B  70kg

Data:

> dput(dat)
structure(list(name = c("Jhon Austin B 100kg", "Mick Gray C 110kg", 
"Tom Jef A 30kg", "John F Kennedy A 30kg", "Foo B 70kg")), class = "data.frame", row.names = c(NA, 
-5L))

You could try splitting into both forename and surname then merging back to one:

library(dplyr)
dt <- dat %>% separate(name, into = c('name', 'name2', 'class', 'pack'), sep = " ", convert = TRUE)
dt$name <- paste(dt$name, dt$name2)
# Get rid of name2
dt <- dt[, -2]

Base R, using a single strsplit() + trimws(); I haven't found the best pattern (yet).

strsplit(xyzzy$name, "(?=\\S+ \\S+$)", perl = TRUE) |>
  unlist() |>
  trimws() |>
  matrix(ncol = 3, byrow = TRUE) |>
  data.frame() |>
  setNames(c('name', 'class', 'pack')) 
         name class  pack
1 Jhon Austin     B 100kg
2   Mick Gray     C 110kg
3     Tom Jef     A  30kg

This approach expects your data to be as well organised as given. Do we really want to carry "kg" in pack? Wouldn't it be better to drop it and make the variable's class numeric?

发布评论

评论列表(0)

  1. 暂无评论