From fbba954227353e6eca5e694912fc6f5f1baf7243 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 2 Apr 2026 18:50:28 +0200 Subject: [PATCH 1/7] fix: only check for structure, not for data --- .../scraping_basics_javascript/exercises/test.bats | 7 +++++-- .../webscraping/scraping_basics_python/exercises/test.bats | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index bb6d8cc0dc..fad40efbaf 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -126,8 +126,11 @@ teardown_file() { run node guardian_f1_authors.mjs [[ "$output" == *' F1 '* ]] - [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) - [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] + while IFS= read -r line; do + [[ "$line" == *': '* ]] + [[ "$line" != ': '* ]] + [[ "$line" != *': ' ]] + done <<< "$output" [[ $(echo "$output" | wc -l) -gt 5 ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 1a0b2844ed..7e49e96f9d 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -120,8 +120,11 @@ teardown() { run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py [[ "$output" == *' F1 '* ]] - [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) - [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] + while IFS= read -r line; do + [[ "$line" == *': '* ]] + [[ "$line" != ': '* ]] + [[ "$line" != *': ' ]] + done <<< "$output" [[ $(echo 
"$output" | wc -l) -gt 5 ]] } From 493eb59e277e2136a72227bd140037a4b0141ee8 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 2 Apr 2026 18:51:09 +0200 Subject: [PATCH 2/7] add comment --- .../webscraping/scraping_basics_javascript/exercises/test.bats | 1 + .../webscraping/scraping_basics_python/exercises/test.bats | 1 + 2 files changed, 2 insertions(+) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index fad40efbaf..019f48e5c8 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -125,6 +125,7 @@ teardown_file() { @test "lists Guardian F1 authors" { run node guardian_f1_authors.mjs + # check that each line is in the AUTHOR: TITLE format [[ "$output" == *' F1 '* ]] while IFS= read -r line; do [[ "$line" == *': '* ]] diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 7e49e96f9d..16eb2ab675 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -119,6 +119,7 @@ teardown() { @test "lists Guardian F1 authors" { run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py + # check that each line is in the AUTHOR: TITLE format [[ "$output" == *' F1 '* ]] while IFS= read -r line; do [[ "$line" == *': '* ]] From 8c060fe8eda2d3ba5eadc6f736ea886618c51acf Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 2 Apr 2026 18:52:46 +0200 Subject: [PATCH 3/7] one more rework of 'lists Guardian F1 authors' assertions --- .../scraping_basics_javascript/exercises/test.bats | 6 ++++-- .../webscraping/scraping_basics_python/exercises/test.bats | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff 
--git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index 019f48e5c8..cdb0286025 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -125,14 +125,16 @@ teardown_file() { @test "lists Guardian F1 authors" { run node guardian_f1_authors.mjs - # check that each line is in the AUTHOR: TITLE format + [[ $(echo "$output" | wc -l) -gt 5 ]] [[ "$output" == *' F1 '* ]] + [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) + + # check that each line is in the AUTHOR: TITLE format while IFS= read -r line; do [[ "$line" == *': '* ]] [[ "$line" != ': '* ]] [[ "$line" != *': ' ]] done <<< "$output" - [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists JavaScript GitHub repos with the LLM topic" { diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 16eb2ab675..1c833952f9 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -119,14 +119,16 @@ teardown() { @test "lists Guardian F1 authors" { run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py - # check that each line is in the AUTHOR: TITLE format + [[ $(echo "$output" | wc -l) -gt 5 ]] [[ "$output" == *' F1 '* ]] + [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) + + # check that each line is in the AUTHOR: TITLE format while IFS= read -r line; do [[ "$line" == *': '* ]] [[ "$line" != ': '* ]] [[ "$line" != *': ' ]] done <<< "$output" - [[ $(echo "$output" | wc -l) -gt 5 ]] } @test "lists Python database jobs" { From 
fb3640b8035c944d2b15c737a08d68d1d3cf7ea9 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 2 Apr 2026 19:48:15 +0200 Subject: [PATCH 4/7] replace IMDb with TMDB --- .../12_framework.md | 24 +++++++-------- .../exercises/crawlee_netflix_ratings.mjs | 18 +++++------ .../exercises/test.bats | 6 ++-- .../scraping_basics_python/12_framework.md | 26 ++++++++-------- .../exercises/crawlee_netflix_ratings.py | 30 ++++++++----------- .../exercises/test.bats | 6 ++-- 6 files changed, 53 insertions(+), 57 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md index 97adbabc62..603ee5073b 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md @@ -417,13 +417,13 @@ If you export the dataset as JSON, it should look something like this: {CrawleeF1DriversExercise.code} -### Use Crawlee to find the ratings of the most popular Netflix films +### Use Crawlee to find the user scores of popular Netflix titles -The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: +The [Global Top 10](https://www.netflix.com/tudum/top10) page has tables listing popular Netflix titles worldwide. Scrape the first 5 title names from this page, then search for each title on [TMDb](https://www.themoviedb.org/). Assume the first search result is correct and retrieve the title's user score. 
Each item you push to Crawlee's default dataset should include the following data: -- URL of the film's IMDb page +- URL of the title's TMDb page - Title -- Rating +- User score If you export the dataset as JSON, it should look something like this: @@ -431,27 +431,27 @@ If you export the dataset as JSON, it should look something like this: ```json [ { - "url": "https://www.imdb.com/title/tt32368345/?ref_=fn_tt_tt_1", + "url": "https://www.themoviedb.org/movie/1278263-the-merry-gentlemen", "title": "The Merry Gentlemen", - "rating": "5.0/10" + "user_score": "61%" }, { - "url": "https://www.imdb.com/title/tt32359447/?ref_=fn_tt_tt_1", + "url": "https://www.themoviedb.org/movie/1156593-hot-frosty", "title": "Hot Frosty", - "rating": "5.4/10" + "user_score": "61%" }, ... ] ``` -To scrape IMDb data, you'll need to construct a `Request` object with the appropriate search URL for each movie title. The following code snippet gives you an idea of how to do this: +To scrape TMDb data, you'll need to construct a `Request` object with the appropriate search URL for each title name. The following code snippet gives you an idea of how to do this: ```js import { CheerioCrawler, Request } from 'crawlee'; import { escape } from 'node:querystring'; -const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; -const request = new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); +const tmdbSearchUrl = `https://www.themoviedb.org/search?query=${escape(name)}`; +const request = new Request({ url: tmdbSearchUrl, label: 'TMDB_SEARCH' }); ``` Then use the `addRequests()` function to instruct Crawlee that it should follow an array of these manually constructed requests: @@ -465,7 +465,7 @@ crawler.router.addDefaultHandler(async ({ $, addRequests }) => { :::tip Need a nudge? 
-When navigating to the first IMDb search result, you might find it helpful to know that `enqueueLinks()` accepts a `limit` option, letting you specify the max number of HTTP requests to enqueue. +When navigating to the first TMDb search result, you might find it helpful to know that `enqueueLinks()` accepts a `limit` option, letting you specify the max number of HTTP requests to enqueue. ::: diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs index 162b8dc0da..c495515f99 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs @@ -8,25 +8,25 @@ crawler.router.addDefaultHandler(async ({ $, addRequests }) => { const buttons = $("[data-uia='top10-table-row-title'] button").toArray().slice(0, 5); const requests = buttons.map((buttonElement) => { const name = $(buttonElement).text().trim(); - const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; - return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); + const tmdbSearchUrl = `https://www.themoviedb.org/search?query=${escape(name)}`; + return new Request({ url: tmdbSearchUrl, label: 'TMDB_SEARCH' }); }); await addRequests(requests); }); -crawler.router.addHandler('IMDB_SEARCH', async ({ enqueueLinks }) => { - await enqueueLinks({ selector: '.ipc-title-link-wrapper', label: 'IMDB', limit: 1 }); +crawler.router.addHandler('TMDB_SEARCH', async ({ enqueueLinks }) => { + await enqueueLinks({ selector: '.title a.result', label: 'TMDB', limit: 1 }); }); -crawler.router.addHandler('IMDB', async ({ $, request, pushData }) => { - const title = $('h1').text().trim(); - const score = $("[data-testid='hero-rating-bar__aggregate-rating__score']").first().text().trim(); 
+crawler.router.addHandler('TMDB', async ({ $, request, pushData }) => { + const title = $('.title a').first().text().trim(); + const userScore = $('.user_score_chart').first().attr('data-percent'); - if (title && score) { + if (title && userScore) { await pushData({ url: request.url, title, - rating: score, + user_score: `${userScore}%`, }); } }); diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index cdb0286025..d187a5b911 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -166,12 +166,12 @@ teardown_file() { [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } -@test "scrapes Netflix ratings with Crawlee" { +@test "scrapes Netflix user scores with Crawlee" { run node crawlee_netflix_ratings.mjs (( status == 0 )) [[ -f dataset.json ]] [[ $(cat dataset.json | jq '. 
| length') == "5" ]] - [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] - [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["title","url","user_score"]' ]] + [[ $(cat dataset.json | jq '.[].url') == *"https://www.themoviedb.org/"* ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index eb7d9425f8..2c0b8fdbf9 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -468,13 +468,13 @@ If you export the dataset as JSON, it should look something like this: {CrawleeF1DriversExercise.code} -### Use Crawlee to find the ratings of the most popular Netflix films +### Use Crawlee to find the user scores of popular Netflix titles -The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: +The [Global Top 10](https://www.netflix.com/tudum/top10) page has tables listing popular Netflix titles worldwide. Scrape the first 5 title names from this page, then search for each title on [TMDb](https://www.themoviedb.org/). Assume the first search result is correct and retrieve the title's user score. 
Each item you push to Crawlee's default dataset should include the following data: -- URL of the film's IMDb page +- URL of the title's TMDb page - Title -- Rating +- User score If you export the dataset as JSON, it should look something like this: @@ -482,20 +482,20 @@ If you export the dataset as JSON, it should look something like this: ```json [ { - "url": "https://www.imdb.com/title/tt32368345/?ref_=fn_tt_tt_1", - "title": "The Merry Gentlemen", - "rating": "5.0/10" + "url": "https://www.themoviedb.org/movie/1278263-the-merry-gentlemen", + "title": "The Merry Gentlemen", + "user_score": "61%" }, { - "url": "https://www.imdb.com/title/tt32359447/?ref_=fn_tt_tt_1", + "url": "https://www.themoviedb.org/movie/1156593-hot-frosty", "title": "Hot Frosty", - "rating": "5.4/10" + "user_score": "61%" }, ... ] ``` -To scrape IMDb data, you'll need to construct a `Request` object with the appropriate search URL for each movie title. The following code snippet gives you an idea of how to do this: +To scrape TMDb data, you'll need to construct a `Request` object with the appropriate search URL for each title name. The following code snippet gives you an idea of how to do this: ```py from urllib.parse import quote_plus @@ -508,8 +508,8 @@ async def main(): requests = [] for name_cell in context.soup.select(...): name = name_cell.text.strip() - imdb_search_url = f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft" - requests.append(Request.from_url(imdb_search_url, label="...")) + tmdb_search_url = f"https://www.themoviedb.org/search?query={quote_plus(name)}" + requests.append(Request.from_url(tmdb_search_url, label="...")) await context.add_requests(requests) ... @@ -517,7 +517,7 @@ async def main(): :::tip Need a nudge? -When navigating to the first IMDb search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue. 
+When navigating to the first TMDb search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue. ::: diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py index 31449cd7fb..2389542b18 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py @@ -14,30 +14,26 @@ async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None: name_cells = context.soup.select('[data-uia="top10-table-row-title"] button') for name_cell in name_cells[:5]: name = name_cell.text.strip() - imdb_search_url = ( - f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft" + tmdb_search_url = ( + f"https://www.themoviedb.org/search?query={quote_plus(name)}" ) - requests.append(Request.from_url(imdb_search_url, label="IMDB_SEARCH")) + requests.append(Request.from_url(tmdb_search_url, label="TMDB_SEARCH")) await context.add_requests(requests) - @crawler.router.handler("IMDB_SEARCH") - async def handle_imdb_search(context: BeautifulSoupCrawlingContext) -> None: - await context.enqueue_links( - selector=".ipc-title-link-wrapper", label="IMDB", limit=1 - ) - - @crawler.router.handler("IMDB") - async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None: - rating_element = context.soup.select_one( - "[data-testid='hero-rating-bar__aggregate-rating__score']" - ) - title_element = context.soup.select_one("h1") - if rating_element and title_element: + @crawler.router.handler("TMDB_SEARCH") + async def handle_tmdb_search(context: BeautifulSoupCrawlingContext) -> None: + await context.enqueue_links(selector=".title a.result", label="TMDB", limit=1) + + @crawler.router.handler("TMDB") + async 
def handle_tmdb(context: BeautifulSoupCrawlingContext) -> None: + score_element = context.soup.select_one(".user_score_chart") + title_element = context.soup.select_one(".title a") + if score_element and title_element: await context.push_data( { "url": context.request.url, "title": title_element.text.strip(), - "rating": rating_element.text.strip(), + "user_score": f"{score_element.get('data-percent')}%", } ) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 1c833952f9..6d6efa6ea5 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -156,12 +156,12 @@ teardown() { [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } -@test "scrapes Netflix ratings with Crawlee" { +@test "scrapes Netflix user scores with Crawlee" { run uv run -q --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py (( status == 0 )) [[ -f dataset.json ]] [[ $(cat dataset.json | jq '. 
| length') -eq 5 ]] - [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] - [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["title","url","user_score"]' ]] + [[ $(cat dataset.json | jq '.[].url') == *"https://www.themoviedb.org/"* ]] } From 339fe6de3e4f9101c3c9cb65c50f911e1a4e42ec Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 2 Apr 2026 19:52:16 +0200 Subject: [PATCH 5/7] fix indentation --- .../webscraping/scraping_basics_python/12_framework.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index 2c0b8fdbf9..bf8f26810c 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -482,14 +482,14 @@ If you export the dataset as JSON, it should look something like this: ```json [ { - "url": "https://www.themoviedb.org/movie/1278263-the-merry-gentlemen", - "title": "The Merry Gentlemen", - "user_score": "61%" + "url": "https://www.themoviedb.org/movie/1278263-the-merry-gentlemen", + "title": "The Merry Gentlemen", + "user_score": "61%" }, { - "url": "https://www.themoviedb.org/movie/1156593-hot-frosty", + "url": "https://www.themoviedb.org/movie/1156593-hot-frosty", "title": "Hot Frosty", - "user_score": "61%" + "user_score": "61%" }, ... 
] From d2c4741e2a187b1a3adc2111fc42d68ac1288f4f Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 16 Apr 2026 20:11:31 +0200 Subject: [PATCH 6/7] fix selector --- .../exercises/crawlee_netflix_ratings.mjs | 2 +- .../scraping_basics_python/exercises/crawlee_netflix_ratings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs index c495515f99..2b16e4c997 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs @@ -15,7 +15,7 @@ crawler.router.addDefaultHandler(async ({ $, addRequests }) => { }); crawler.router.addHandler('TMDB_SEARCH', async ({ enqueueLinks }) => { - await enqueueLinks({ selector: '.title a.result', label: 'TMDB', limit: 1 }); + await enqueueLinks({ selector: '.results a', label: 'TMDB', limit: 1 }); }); crawler.router.addHandler('TMDB', async ({ $, request, pushData }) => { diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py index 2389542b18..aab91a6f8f 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py @@ -22,7 +22,7 @@ async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None: @crawler.router.handler("TMDB_SEARCH") async def handle_tmdb_search(context: BeautifulSoupCrawlingContext) -> None: - await context.enqueue_links(selector=".title a.result", label="TMDB", limit=1) + await context.enqueue_links(selector=".results a", label="TMDB", limit=1) @crawler.router.handler("TMDB") async def 
handle_tmdb(context: BeautifulSoupCrawlingContext) -> None: From 30f8d42d2eb6b3f31b6b592a44ee0bf3a420c90d Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Thu, 16 Apr 2026 20:11:48 +0200 Subject: [PATCH 7/7] update examples --- .../scraping_basics_javascript/12_framework.md | 17 +++++++++++------ .../scraping_basics_python/12_framework.md | 17 +++++++++++------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md index 603ee5073b..a4efece980 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md @@ -431,14 +431,19 @@ If you export the dataset as JSON, it should look something like this: ```json [ { - "url": "https://www.themoviedb.org/movie/1278263-the-merry-gentlemen", - "title": "The Merry Gentlemen", - "user_score": "61%" + "url": "https://www.themoviedb.org/movie/1290417-thrash", + "title": "Thrash", + "user_score": "59%" }, { - "url": "https://www.themoviedb.org/movie/1156593-hot-frosty", - "title": "Hot Frosty", - "user_score": "61%" + "url": "https://www.themoviedb.org/movie/1629369-the-truth-and-tragedy-of-moriah-wilson", + "title": "The Truth and Tragedy of Moriah Wilson", + "user_score": "71%" + }, + { + "url": "https://www.themoviedb.org/movie/1234731-anaconda", + "title": "Anaconda", + "user_score": "59%" }, ... 
] diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index bf8f26810c..3d89792de2 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -482,14 +482,19 @@ If you export the dataset as JSON, it should look something like this: ```json [ { - "url": "https://www.themoviedb.org/movie/1278263-the-merry-gentlemen", - "title": "The Merry Gentlemen", - "user_score": "61%" + "url": "https://www.themoviedb.org/movie/1290417-thrash", + "title": "Thrash", + "user_score": "59%" }, { - "url": "https://www.themoviedb.org/movie/1156593-hot-frosty", - "title": "Hot Frosty", - "user_score": "61%" + "url": "https://www.themoviedb.org/movie/1629369-the-truth-and-tragedy-of-moriah-wilson", + "title": "The Truth and Tragedy of Moriah Wilson", + "user_score": "71%" + }, + { + "url": "https://www.themoviedb.org/movie/1234731-anaconda", + "title": "Anaconda", + "user_score": "59%" }, ... ]