te')); return $arr; }

/**
 * List the threads of one user through thread_tid__find().
 *
 * @param mixed  $uid      User id. When empty() is true for it, array() is returned without querying.
 * @param int    $page     Page number.
 * @param int    $pagesize Number of records per page.
 * @param bool   $desc     Sort direction on tid: TRUE descending, FALSE ascending.
 * @param string $key      Column whose value is used as the key of the returned array.
 * @param array  $col      Columns to query.
 * @return mixed The result of thread_tid__find(), or array() for an empty $uid.
 */
function thread_tid_find_by_uid($uid, $page = 1, $pagesize = 1000, $desc = TRUE, $key = 'tid', $col = array()) {
    if (empty($uid)) return array();

    // -1 is passed for descending order on tid, 1 for ascending.
    $orderby = TRUE == $desc ? -1 : 1;

    return thread_tid__find(array('uid' => $uid), array('tid' => $orderby), $page, $pagesize, $key, $col);
}

/**
 * List the threads of a forum through thread_tid__find(), keyed by tid and
 * restricted to the columns tid and verify_date.
 *
 * @param mixed $fid      Forum id; the original comment says an array such as array(1,2,3) is supported.
 *                        When empty() is true for it, array() is returned without querying.
 * @param int   $page     Page number.
 * @param int   $pagesize Number of records per page.
 * @param bool  $desc     Sort direction on tid: TRUE descending, FALSE ascending.
 * @return mixed The result of thread_tid__find(), or array() for an empty $fid.
 */
function thread_tid_find_by_fid($fid, $page = 1, $pagesize = 1000, $desc = TRUE) {
    if (empty($fid)) return array();

    // -1 is passed for descending order on tid, 1 for ascending.
    $orderby = TRUE == $desc ? -1 : 1;

    return thread_tid__find(array('fid' => $fid), array('tid' => $orderby), $page, $pagesize, 'tid', array('tid', 'verify_date'));
}

/**
 * Delete the record matching a thread id through thread_tid__delete().
 *
 * @param mixed $tid Thread id. When empty() is true for it, FALSE is returned without deleting.
 * @return mixed FALSE for an empty $tid, otherwise the result of thread_tid__delete().
 */
function thread_tid_delete($tid) {
    if (empty($tid)) return FALSE;

    return thread_tid__delete(array('tid' => $tid));
}

/**
 * Count all records through thread_tid__count() with no condition.
 *
 * @return mixed The result of thread_tid__count().
 */
function thread_tid_count() {
    return thread_tid__count();
}

/**
 * Count the threads of one user.
 *
 * NOTE(review): the original comment warns about counting on a non-primary-key
 * column with large data volumes; it presumably means "avoid" - confirm.
 *
 * @param mixed $uid User id used as the count condition.
 * @return mixed The result of thread_tid__count().
 */
function thread_uid_count($uid) {
    return thread_tid__count(array('uid' => $uid));
}

/**
 * Count the threads of one forum.
 *
 * NOTE(review): the original comment warns about counting on a non-primary-key
 * column with large data volumes; it presumably means "avoid" - confirm.
 *
 * @param mixed $fid Forum id used as the count condition.
 * @return mixed The result of thread_tid__count().
 */
function thread_fid_count($fid) {
    return thread_tid__count(array('fid' => $fid));
}

?>r - webscrape table using rvest - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

r - webscrape table using rvest - Stack Overflow

programmeradmin3浏览0评论

I am attempting to scrape the table on this page using rvest: https://www.nrl/ladder/?competition=111&round=27&season=2024

This is what I have tried so far

library(rvest)

page <- read_html("/?competition=111&round=27&season=2024")

contentnodes <-page %>% html_nodes ("div.l-content.pre-quench") %>% 
  html_attr("q-data") %>% jsonlite::fromJSON()

but it gives me the following error

Error: lexical error: invalid char in json text.
                                   NA
                 (right here) ------^

Could someone point out what I have done wrong, please?

I am attempting to scrape the table on this page using rvest https://www.nrl/ladder/?competition=111&round=27&season=2024

This is what I have tried so far

library(rvest)

page <- read_html("https://www.nrl/ladder/?competition=111&round=27&season=2024")

contentnodes <-page %>% html_nodes ("div.l-content.pre-quench") %>% 
  html_attr("q-data") %>% jsonlite::fromJSON()

but it gives me the following error

Error: lexical error: invalid char in json text.
                                   NA
                 (right here) ------^

Could someone point out what I have done wrong, please?

Share Improve this question edited Feb 18 at 3:30 Edward 19.1k3 gold badges16 silver badges35 bronze badges asked Feb 18 at 0:16 HowGoodisDataHowGoodisData 254 bronze badges
Add a comment  | 

1 Answer 1

Reset to default 1

Many pages fill tables programmatically after load, making static HTML scraping a little harder. This is the case here.

One way to work around this is to use read_html_live instead. Inspecting the table in a browser shows that its id is "ladder-table", so it can be selected with the CSS id selector "#ladder-table":

page <- rvest::read_html_live("https://www.nrl/ladder/?competition=111&round=27&season=2024")
rvest::html_element(page, "#ladder-table") |>
  rvest::html_table()
# # A tibble: 17 × 17
#      Pos Team    Pos ``           played points  wins drawn  lost  byes `for` against diff. home   away   form  Next 
#    <int> <lgl> <int> <chr>         <int>  <int> <int> <int> <int> <int> <int>   <int> <int> <chr>  <chr>  <chr> <chr>
#  1     1 NA        1 Storm            24     44    19     0     5     3   692     449   243 10 - 2 9 - 3  4 - 1 "Sha…
#  2     2 NA        2 Panthers         24     40    17     0     7     3   580     394   186 9 - 3  8 - 4  3 - 2 "Roo…
#  3     3 NA        3 Roosters         24     38    16     0     8     3   738     463   275 8 - 4  8 - 4  3 - 1 "Pan…
#  4     4 NA        4 Sharks           24     38    16     0     8     3   653     431   222 8 - 4  8 - 4  4 - 1 "Sto…
#  5     5 NA        5 Cowboys          24     36    15     0     9     3   657     568    89 7 - 5  8 - 4  3 - 1 "Kni…
#  6     6 NA        6 Bulldogs         24     34    14     0    10     3   529     433    96 10 - 2 4 - 8  3 - 2 "Sea…
#  7     7 NA        7 Sea Eagles       24     33    13     1    10     3   634     521   113 9 - 3  4 - 7  3 - 2 "Bul…
#  8     8 NA        8 Knights          24     30    12     0    12     3   470     510   -40 7 - 5  5 - 7  4 - 1 "Cow…
#  9     9 NA        9 Raiders          24     30    12     0    12     3   474     601  -127 7 - 5  5 - 7  3 - 2 ""   
# 10    10 NA       10 Dolphins         24     28    11     0    13     3   577     578    -1 6 - 6  5 - 7  2 - 3 ""   
# 11    11 NA       11 Dragons          24     28    11     0    13     3   508     634  -126 6 - 6  5 - 7  1 - 4 ""   
# 12    12 NA       12 Broncos          24     26    10     0    14     3   537     607   -70 5 - 7  5 - 7  2 - 2 ""   
# 13    13 NA       13 Warriors         24     25     9     1    14     3   512     574   -62 6 - 5  3 - 9  1 - 3 ""   
# 14    14 NA       14 Titans           24     22     8     0    16     3   488     656  -168 4 - 8  4 - 8  0 - 5 ""   
# 15    15 NA       15 Eels             24     20     7     0    17     3   561     716  -155 5 - 7  2 - 10 2 - 3 ""   
# 16    16 NA       16 Rabbitohs        24     20     7     0    17     3   494     682  -188 5 - 7  2 - 10 0 - 5 ""   
# 17    17 NA       17 Wests Tigers     24     18     6     0    18     3   463     750  -287 5 - 7  1 - 11 2 - 2 ""   

In your case, there was nothing to pull:

page <- rvest::read_html("https://www.nrl/ladder/?competition=111&round=27&season=2024")
rvest::html_element(page, "#q-data")
# {xml_missing}
# <NA>
rvest::html_element(page, "q-data")
# {xml_missing}
# <NA>
rvest::html_attr(page, "q-data")
# [1] NA

However, in this case the raw data can also be found in the static HTML (I found it thanks to your hint about q-data), so we can do something similar without read_html_live (which does incur a little overhead, because it needs chromote). Looking at the page source, I see that two elements have an attribute named q-data, and the second is the element with id="vue-ladder". Let's pull that element first and then read the attribute:

page <- rvest::read_html("https://www.nrl/ladder/?competition=111&round=27&season=2024")
rvest::html_element(page, "#vue-ladder") |>
  rvest::html_attr("q-data") |>
  jsonlite::fromJSON() |>
  str()
# List of 11
#  $ filterCompetitions   :'data.frame':    5 obs. of  3 variables:
#   ..$ name : chr [1:5] "Telstra Premiership" "Telstra Women's Premiership" "Witzer Pre-Season Challenge" "The Knock On Effect NSW Cup" ...
#   ..$ value: int [1:5] 111 161 119 113 114
#   ..$ theme:'data.frame': 5 obs. of  2 variables:
#   .. ..$ key  : chr [1:5] "nrl-premiership" "nrl-womens-premiership" "pre-season-challenge" "nsw-cup" ...
#   .. ..$ logos:'data.frame':  5 obs. of  9 variables:
#   .. .. ..$ badge-light.png      : chr [1:5] "202502070006" "202502070006" "202502070130" "202502070006" ...
#   .. .. ..$ badge-light.svg      : chr [1:5] "202502070006" "202502070006" "202502070130" "202502070006" ...
#   .. .. ..$ badge.png            : chr [1:5] "202502070006" "202502070006" "202502070130" "202502070006" ...
#   .. .. ..$ badge.svg            : chr [1:5] "202502070006" "202502070006" "202502070130" "202502070006" ...
#   .. .. ..$ header-background.png: chr [1:5] NA NA NA NA ...
#   .. .. ..$ header-background.svg: chr [1:5] NA NA NA NA ...
#   .. .. ..$ silhouette.png       : chr [1:5] NA NA NA NA ...
#   .. .. ..$ silhouette.svg       : chr [1:5] NA NA NA NA ...
#   .. .. ..$ text.svg             : chr [1:5] NA NA NA NA ...
#  $ filterRounds         :'data.frame':    27 obs. of  2 variables:
#   ..$ name : chr [1:27] "Round 1" "Round 2" "Round 3" "Round 4" ...
#   ..$ value: int [1:27] 1 2 3 4 5 6 7 8 9 10 ...
#  $ filterSeasons        :'data.frame':    28 obs. of  2 variables:
#   ..$ name : chr [1:28] "2025" "2024" "2023" "2022" ...
#   ..$ value: int [1:28] 2025 2024 2023 2022 2021 2020 2019 2018 2017 2016 ...
#  $ finalistTeams        : int 8
#  $ positions            :'data.frame':    17 obs. of  6 variables:
#   ..$ clubProfileUrl: chr [1:17] "/clubs/melbourne-storm/" "/clubs/penrith-panthers/" "/clubs/sydney-roosters/" "/clubs/cronulla-sutherland-sharks/" ...
#   ..$ movement      : chr [1:17] "none" "none" "none" "none" ...
#   ..$ next          :'data.frame':    17 obs. of  6 variables:
#   .. ..$ fullName      : chr [1:17] "Cronulla-Sutherland Sharks" "Sydney Roosters" "Penrith Panthers" "Melbourne Storm" ...
#   .. ..$ teamId        : int [1:17] 500028 500001 500014 500021 500003 500002 500010 500012 NA NA ...
#   .. ..$ nickname      : chr [1:17] "Sharks" "Roosters" "Panthers" "Storm" ...
#   .. ..$ theme         :'data.frame': 17 obs. of  2 variables:
#   .. .. ..$ key  : chr [1:17] "sharks" "roosters" "panthers" "storm" ...
#   .. .. ..$ logos:'data.frame':   17 obs. of  12 variables:
#   .. .. .. ..$ badge-basic24-light.svg: chr [1:17] "202502070006" NA NA "202502070006" ...
#   .. .. .. ..$ badge-basic24-mono.svg : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. .. ..$ badge-basic24.svg      : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. .. ..$ badge-light.png        : chr [1:17] "202502070006" NA "202502070006" "202502070006" ...
#   .. .. .. ..$ badge-light.svg        : chr [1:17] "202502070006" NA "202502070006" "202502070006" ...
#   .. .. .. ..$ badge.png              : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. .. ..$ badge.svg              : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. .. ..$ header-background.png  : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. .. ..$ header-background.svg  : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. .. ..$ silhouette.png         : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. .. ..$ silhouette.svg         : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. .. ..$ text.svg               : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. ..$ matchCentreUrl: chr [1:17] "https://www.nrl/draw/nrl-premiership/2024/finals-week-1/game-2/" "https://www.nrl/draw/nrl-premiership/2024/finals-week-1/game-1/" "https://www.nrl/draw/nrl-premiership/2024/finals-week-1/game-1/" "https://www.nrl/draw/nrl-premiership/2024/finals-week-1/game-2/" ...
#   .. ..$ isBye         : logi [1:17] FALSE FALSE FALSE FALSE FALSE FALSE ...
#   ..$ stats         :'data.frame':    17 obs. of  21 variables:
#   .. ..$ played                : int [1:17] 24 24 24 24 24 24 24 24 24 24 ...
#   .. ..$ wins                  : int [1:17] 19 17 16 16 15 14 13 12 12 11 ...
#   .. ..$ drawn                 : int [1:17] 0 0 0 0 0 0 1 0 0 0 ...
#   .. ..$ lost                  : int [1:17] 5 7 8 8 9 10 10 12 12 13 ...
#   .. ..$ byes                  : int [1:17] 3 3 3 3 3 3 3 3 3 3 ...
#   .. ..$ points for            : int [1:17] 692 580 738 653 657 529 634 470 474 577 ...
#   .. ..$ points against        : int [1:17] 449 394 463 431 568 433 521 510 601 578 ...
#   .. ..$ points difference     : int [1:17] 243 186 275 222 89 96 113 -40 -127 -1 ...
#   .. ..$ home record           : chr [1:17] "10 - 2" "9 - 3" "8 - 4" "8 - 4" ...
#   .. ..$ away record           : chr [1:17] "9 - 3" "8 - 4" "8 - 4" "8 - 4" ...
#   .. ..$ points                : int [1:17] 44 40 38 38 36 34 33 30 30 28 ...
#   .. ..$ bonus points          : int [1:17] 0 0 0 0 0 0 0 0 0 0 ...
#   .. ..$ streak                : chr [1:17] "1W" "2W" "1W" "1W" ...
#   .. ..$ form                  : chr [1:17] "4 - 1" "3 - 2" "3 - 1" "4 - 1" ...
#   .. ..$ average losing margin : num [1:17] 5 7.4 6.4 12.1 14.9 12.3 7.7 13.3 19.5 12.2 ...
#   .. ..$ average winning margin: num [1:17] 14.1 14 20.4 19.9 14.9 15.6 14.6 10 8.9 14.4 ...
#   .. ..$ golden point          : int [1:17] 0 1 0 2 2 2 2 2 2 3 ...
#   .. ..$ close games           : int [1:17] 10 9 8 6 8 10 8 9 10 10 ...
#   .. ..$ day record            : chr [1:17] "2-0-0" "6-0-3" "5-0-3" "6-0-0" ...
#   .. ..$ night record          : chr [1:17] "17-0-5" "11-0-4" "11-0-5" "10-0-8" ...
#   .. ..$ players used          : int [1:17] 34 33 28 31 28 32 33 29 30 28 ...
#   ..$ teamNickname  : chr [1:17] "Storm" "Panthers" "Roosters" "Sharks" ...
#   ..$ theme         :'data.frame':    17 obs. of  2 variables:
#   .. ..$ key  : chr [1:17] "storm" "panthers" "roosters" "sharks" ...
#   .. ..$ logos:'data.frame':  17 obs. of  12 variables:
#   .. .. ..$ badge-basic24-light.svg: chr [1:17] "202502070006" NA NA "202502070006" ...
#   .. .. ..$ badge-basic24-mono.svg : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. ..$ badge-basic24.svg      : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. ..$ badge-light.png        : chr [1:17] "202502070006" "202502070006" NA "202502070006" ...
#   .. .. ..$ badge-light.svg        : chr [1:17] "202502070006" "202502070006" NA "202502070006" ...
#   .. .. ..$ badge.png              : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. ..$ badge.svg              : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. ..$ header-background.png  : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. ..$ header-background.svg  : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. ..$ silhouette.png         : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. ..$ silhouette.svg         : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#   .. .. ..$ text.svg               : chr [1:17] "202502070006" "202502070006" "202502070006" "202502070006" ...
#  $ selectedCompetitionId: int 111
#  $ selectedRoundId      : int 27
#  $ selectedSeasonId     : int 2024
#  $ showOdds             : logi FALSE
#  $ showPredictor        : logi FALSE
#  $ showBonusPoints      : logi FALSE
发布评论

评论列表(0)

  1. 暂无评论