diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md index 97adbabc62..a4efece980 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md @@ -417,13 +417,13 @@ If you export the dataset as JSON, it should look something like this: {CrawleeF1DriversExercise.code} -### Use Crawlee to find the ratings of the most popular Netflix films +### Use Crawlee to find the user scores of popular Netflix titles -The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: +The [Global Top 10](https://www.netflix.com/tudum/top10) page has tables listing popular Netflix titles worldwide. Scrape the first 5 title names from this page, then search for each title on [TMDb](https://www.themoviedb.org/). Assume the first search result is correct and retrieve the title's user score. 
Each item you push to Crawlee's default dataset should include the following data: -- URL of the film's IMDb page +- URL of the title's TMDb page - Title -- Rating +- User score If you export the dataset as JSON, it should look something like this: @@ -431,27 +431,32 @@ If you export the dataset as JSON, it should look something like this: ```json [ { - "url": "https://www.imdb.com/title/tt32368345/?ref_=fn_tt_tt_1", - "title": "The Merry Gentlemen", - "rating": "5.0/10" + "url": "https://www.themoviedb.org/movie/1290417-thrash", + "title": "Thrash", + "user_score": "59%" }, { - "url": "https://www.imdb.com/title/tt32359447/?ref_=fn_tt_tt_1", - "title": "Hot Frosty", - "rating": "5.4/10" + "url": "https://www.themoviedb.org/movie/1629369-the-truth-and-tragedy-of-moriah-wilson", + "title": "The Truth and Tragedy of Moriah Wilson", + "user_score": "71%" + }, + { + "url": "https://www.themoviedb.org/movie/1234731-anaconda", + "title": "Anaconda", + "user_score": "59%" }, ... ] ``` -To scrape IMDb data, you'll need to construct a `Request` object with the appropriate search URL for each movie title. The following code snippet gives you an idea of how to do this: +To scrape TMDb data, you'll need to construct a `Request` object with the appropriate search URL for each title name. 
The following code snippet gives you an idea of how to do this: ```js import { CheerioCrawler, Request } from 'crawlee'; import { escape } from 'node:querystring'; -const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; -const request = new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); +const tmdbSearchUrl = `https://www.themoviedb.org/search?query=${escape(name)}`; +const request = new Request({ url: tmdbSearchUrl, label: 'TMDB_SEARCH' }); ``` Then use the `addRequests()` function to instruct Crawlee that it should follow an array of these manually constructed requests: @@ -465,7 +470,7 @@ crawler.router.addDefaultHandler(async ({ $, addRequests }) => { :::tip Need a nudge? -When navigating to the first IMDb search result, you might find it helpful to know that `enqueueLinks()` accepts a `limit` option, letting you specify the max number of HTTP requests to enqueue. +When navigating to the first TMDb search result, you might find it helpful to know that `enqueueLinks()` accepts a `limit` option, letting you specify the max number of HTTP requests to enqueue. 
::: diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs index 162b8dc0da..2b16e4c997 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs @@ -8,25 +8,25 @@ crawler.router.addDefaultHandler(async ({ $, addRequests }) => { const buttons = $("[data-uia='top10-table-row-title'] button").toArray().slice(0, 5); const requests = buttons.map((buttonElement) => { const name = $(buttonElement).text().trim(); - const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`; - return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' }); + const tmdbSearchUrl = `https://www.themoviedb.org/search?query=${escape(name)}`; + return new Request({ url: tmdbSearchUrl, label: 'TMDB_SEARCH' }); }); await addRequests(requests); }); -crawler.router.addHandler('IMDB_SEARCH', async ({ enqueueLinks }) => { - await enqueueLinks({ selector: '.ipc-title-link-wrapper', label: 'IMDB', limit: 1 }); +crawler.router.addHandler('TMDB_SEARCH', async ({ enqueueLinks }) => { + await enqueueLinks({ selector: '.results a', label: 'TMDB', limit: 1 }); }); -crawler.router.addHandler('IMDB', async ({ $, request, pushData }) => { - const title = $('h1').text().trim(); - const score = $("[data-testid='hero-rating-bar__aggregate-rating__score']").first().text().trim(); +crawler.router.addHandler('TMDB', async ({ $, request, pushData }) => { + const title = $('.title a').first().text().trim(); + const userScore = $('.user_score_chart').first().attr('data-percent'); - if (title && score) { + if (title && userScore) { await pushData({ url: request.url, title, - rating: score, + user_score: `${userScore}%`, }); } }); diff --git 
a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats index bb6d8cc0dc..d187a5b911 100644 --- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats @@ -125,10 +125,16 @@ teardown_file() { @test "lists Guardian F1 authors" { run node guardian_f1_authors.mjs + [[ $(echo "$output" | wc -l) -gt 5 ]] [[ "$output" == *' F1 '* ]] [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) - [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + + # check that each line is in the AUTHOR: TITLE format + while IFS= read -r line; do + [[ "$line" == *': '* ]] + [[ "$line" != ': '* ]] + [[ "$line" != *': ' ]] + done <<< "$output" } @test "lists JavaScript GitHub repos with the LLM topic" { @@ -160,12 +166,12 @@ teardown_file() { [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } -@test "scrapes Netflix ratings with Crawlee" { +@test "scrapes Netflix user scores with Crawlee" { run node crawlee_netflix_ratings.mjs (( status == 0 )) [[ -f dataset.json ]] [[ $(cat dataset.json | jq '. 
| length') == "5" ]] - [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] - [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["title","url","user_score"]' ]] + [[ $(cat dataset.json | jq '.[].url') == *"https://www.themoviedb.org/"* ]] } diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md index eb7d9425f8..3d89792de2 100644 --- a/sources/academy/webscraping/scraping_basics_python/12_framework.md +++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md @@ -468,13 +468,13 @@ If you export the dataset as JSON, it should look something like this: {CrawleeF1DriversExercise.code} -### Use Crawlee to find the ratings of the most popular Netflix films +### Use Crawlee to find the user scores of popular Netflix titles -The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data: +The [Global Top 10](https://www.netflix.com/tudum/top10) page has tables listing popular Netflix titles worldwide. Scrape the first 5 title names from this page, then search for each title on [TMDb](https://www.themoviedb.org/). Assume the first search result is correct and retrieve the title's user score. 
Each item you push to Crawlee's default dataset should include the following data: -- URL of the film's IMDb page +- URL of the title's TMDb page - Title -- Rating +- User score If you export the dataset as JSON, it should look something like this: @@ -482,20 +482,25 @@ If you export the dataset as JSON, it should look something like this: ```json [ { - "url": "https://www.imdb.com/title/tt32368345/?ref_=fn_tt_tt_1", - "title": "The Merry Gentlemen", - "rating": "5.0/10" + "url": "https://www.themoviedb.org/movie/1290417-thrash", + "title": "Thrash", + "user_score": "59%" }, { - "url": "https://www.imdb.com/title/tt32359447/?ref_=fn_tt_tt_1", - "title": "Hot Frosty", - "rating": "5.4/10" + "url": "https://www.themoviedb.org/movie/1629369-the-truth-and-tragedy-of-moriah-wilson", + "title": "The Truth and Tragedy of Moriah Wilson", + "user_score": "71%" + }, + { + "url": "https://www.themoviedb.org/movie/1234731-anaconda", + "title": "Anaconda", + "user_score": "59%" }, ... ] ``` -To scrape IMDb data, you'll need to construct a `Request` object with the appropriate search URL for each movie title. The following code snippet gives you an idea of how to do this: +To scrape TMDb data, you'll need to construct a `Request` object with the appropriate search URL for each title name. The following code snippet gives you an idea of how to do this: ```py from urllib.parse import quote_plus @@ -508,8 +513,8 @@ async def main(): requests = [] for name_cell in context.soup.select(...): name = name_cell.text.strip() - imdb_search_url = f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft" - requests.append(Request.from_url(imdb_search_url, label="...")) + tmdb_search_url = f"https://www.themoviedb.org/search?query={quote_plus(name)}" + requests.append(Request.from_url(tmdb_search_url, label="...")) await context.add_requests(requests) ... @@ -517,7 +522,7 @@ async def main(): :::tip Need a nudge? 
-When navigating to the first IMDb search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue. +When navigating to the first TMDb search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue. ::: diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py index 31449cd7fb..aab91a6f8f 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py +++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py @@ -14,30 +14,26 @@ async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None: name_cells = context.soup.select('[data-uia="top10-table-row-title"] button') for name_cell in name_cells[:5]: name = name_cell.text.strip() - imdb_search_url = ( - f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft" + tmdb_search_url = ( + f"https://www.themoviedb.org/search?query={quote_plus(name)}" ) - requests.append(Request.from_url(imdb_search_url, label="IMDB_SEARCH")) + requests.append(Request.from_url(tmdb_search_url, label="TMDB_SEARCH")) await context.add_requests(requests) - @crawler.router.handler("IMDB_SEARCH") - async def handle_imdb_search(context: BeautifulSoupCrawlingContext) -> None: - await context.enqueue_links( - selector=".ipc-title-link-wrapper", label="IMDB", limit=1 - ) - - @crawler.router.handler("IMDB") - async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None: - rating_element = context.soup.select_one( - "[data-testid='hero-rating-bar__aggregate-rating__score']" - ) - title_element = context.soup.select_one("h1") - if rating_element and title_element: + 
@crawler.router.handler("TMDB_SEARCH") + async def handle_tmdb_search(context: BeautifulSoupCrawlingContext) -> None: + await context.enqueue_links(selector=".results a", label="TMDB", limit=1) + + @crawler.router.handler("TMDB") + async def handle_tmdb(context: BeautifulSoupCrawlingContext) -> None: + score_element = context.soup.select_one(".user_score_chart") + title_element = context.soup.select_one(".title a") + if score_element and title_element: await context.push_data( { "url": context.request.url, "title": title_element.text.strip(), - "rating": rating_element.text.strip(), + "user_score": f"{score_element.get('data-percent')}%", } ) diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats index 1a0b2844ed..6d6efa6ea5 100644 --- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats +++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats @@ -119,10 +119,16 @@ teardown() { @test "lists Guardian F1 authors" { run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py + [[ $(echo "$output" | wc -l) -gt 5 ]] [[ "$output" == *' F1 '* ]] [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him) - [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]] - [[ $(echo "$output" | wc -l) -gt 5 ]] + + # check that each line is in the AUTHOR: TITLE format + while IFS= read -r line; do + [[ "$line" == *': '* ]] + [[ "$line" != ': '* ]] + [[ "$line" != *': ' ]] + done <<< "$output" } @test "lists Python database jobs" { @@ -150,12 +156,12 @@ teardown() { [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]] } -@test "scrapes Netflix ratings with Crawlee" { +@test "scrapes Netflix user scores with Crawlee" { run uv run -q --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py (( status == 0 )) [[ -f 
dataset.json ]] [[ $(cat dataset.json | jq '. | length') -eq 5 ]] - [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]] - [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]] + [[ $(cat dataset.json | jq -c '.[0] | keys') == '["title","url","user_score"]' ]] + [[ $(cat dataset.json | jq '.[].url') == *"https://www.themoviedb.org/"* ]] }