diff --git a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md
index 97adbabc62..a4efece980 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/12_framework.md
+++ b/sources/academy/webscraping/scraping_basics_javascript/12_framework.md
@@ -417,13 +417,13 @@ If you export the dataset as JSON, it should look something like this:
{CrawleeF1DriversExercise.code}
-### Use Crawlee to find the ratings of the most popular Netflix films
+### Use Crawlee to find the user scores of popular Netflix titles
-The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data:
+The [Global Top 10](https://www.netflix.com/tudum/top10) page has tables listing popular Netflix titles worldwide. Scrape the first 5 title names from this page, then search for each title on [TMDb](https://www.themoviedb.org/). Assume the first search result is correct and retrieve the title's user score. Each item you push to Crawlee's default dataset should include the following data:
-- URL of the film's IMDb page
+- URL of the title's TMDb page
- Title
-- Rating
+- User score
If you export the dataset as JSON, it should look something like this:
@@ -431,27 +431,32 @@ If you export the dataset as JSON, it should look something like this:
```json
[
{
- "url": "https://www.imdb.com/title/tt32368345/?ref_=fn_tt_tt_1",
- "title": "The Merry Gentlemen",
- "rating": "5.0/10"
+ "url": "https://www.themoviedb.org/movie/1290417-thrash",
+ "title": "Thrash",
+ "user_score": "59%"
},
{
- "url": "https://www.imdb.com/title/tt32359447/?ref_=fn_tt_tt_1",
- "title": "Hot Frosty",
- "rating": "5.4/10"
+ "url": "https://www.themoviedb.org/movie/1629369-the-truth-and-tragedy-of-moriah-wilson",
+ "title": "The Truth and Tragedy of Moriah Wilson",
+ "user_score": "71%"
+ },
+ {
+ "url": "https://www.themoviedb.org/movie/1234731-anaconda",
+ "title": "Anaconda",
+ "user_score": "59%"
},
...
]
```
-To scrape IMDb data, you'll need to construct a `Request` object with the appropriate search URL for each movie title. The following code snippet gives you an idea of how to do this:
+To scrape TMDb data, you'll need to construct a `Request` object with the appropriate search URL for each title name. The following code snippet gives you an idea of how to do this:
```js
import { CheerioCrawler, Request } from 'crawlee';
import { escape } from 'node:querystring';
-const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`;
-const request = new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' });
+const tmdbSearchUrl = `https://www.themoviedb.org/search?query=${escape(name)}`;
+const request = new Request({ url: tmdbSearchUrl, label: 'TMDB_SEARCH' });
```
Then use the `addRequests()` function to instruct Crawlee that it should follow an array of these manually constructed requests:
@@ -465,7 +470,7 @@ crawler.router.addDefaultHandler(async ({ $, addRequests }) => {
:::tip Need a nudge?
-When navigating to the first IMDb search result, you might find it helpful to know that `enqueueLinks()` accepts a `limit` option, letting you specify the max number of HTTP requests to enqueue.
+When navigating to the first TMDb search result, you might find it helpful to know that `enqueueLinks()` accepts a `limit` option, letting you specify the max number of HTTP requests to enqueue.
:::
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs
index 162b8dc0da..2b16e4c997 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/crawlee_netflix_ratings.mjs
@@ -8,25 +8,25 @@ crawler.router.addDefaultHandler(async ({ $, addRequests }) => {
const buttons = $("[data-uia='top10-table-row-title'] button").toArray().slice(0, 5);
const requests = buttons.map((buttonElement) => {
const name = $(buttonElement).text().trim();
- const imdbSearchUrl = `https://www.imdb.com/find/?q=${escape(name)}&s=tt&ttype=ft`;
- return new Request({ url: imdbSearchUrl, label: 'IMDB_SEARCH' });
+ const tmdbSearchUrl = `https://www.themoviedb.org/search?query=${escape(name)}`;
+ return new Request({ url: tmdbSearchUrl, label: 'TMDB_SEARCH' });
});
await addRequests(requests);
});
-crawler.router.addHandler('IMDB_SEARCH', async ({ enqueueLinks }) => {
- await enqueueLinks({ selector: '.ipc-title-link-wrapper', label: 'IMDB', limit: 1 });
+crawler.router.addHandler('TMDB_SEARCH', async ({ enqueueLinks }) => {
+ await enqueueLinks({ selector: '.results a', label: 'TMDB', limit: 1 });
});
-crawler.router.addHandler('IMDB', async ({ $, request, pushData }) => {
- const title = $('h1').text().trim();
- const score = $("[data-testid='hero-rating-bar__aggregate-rating__score']").first().text().trim();
+crawler.router.addHandler('TMDB', async ({ $, request, pushData }) => {
+ const title = $('.title a').first().text().trim();
+ const userScore = $('.user_score_chart').first().attr('data-percent');
- if (title && score) {
+ if (title && userScore) {
await pushData({
url: request.url,
title,
- rating: score,
+ user_score: `${userScore}%`,
});
}
});
diff --git a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats
index bb6d8cc0dc..d187a5b911 100644
--- a/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats
+++ b/sources/academy/webscraping/scraping_basics_javascript/exercises/test.bats
@@ -125,10 +125,16 @@ teardown_file() {
@test "lists Guardian F1 authors" {
run node guardian_f1_authors.mjs
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
[[ "$output" == *' F1 '* ]]
[[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him)
- [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]]
- [[ $(echo "$output" | wc -l) -gt 5 ]]
+
+ # check that each line is in the AUTHOR: TITLE format
+ while IFS= read -r line; do
+ [[ "$line" == *': '* ]]
+ [[ "$line" != ': '* ]]
+ [[ "$line" != *': ' ]]
+ done <<< "$output"
}
@test "lists JavaScript GitHub repos with the LLM topic" {
@@ -160,12 +166,12 @@ teardown_file() {
[[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]]
}
-@test "scrapes Netflix ratings with Crawlee" {
+@test "scrapes Netflix user scores with Crawlee" {
run node crawlee_netflix_ratings.mjs
(( status == 0 ))
[[ -f dataset.json ]]
[[ $(cat dataset.json | jq '. | length') == "5" ]]
- [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]]
- [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]]
+ [[ $(cat dataset.json | jq -c '.[0] | keys') == '["title","url","user_score"]' ]]
+ [[ $(cat dataset.json | jq '.[].url') == *"https://www.themoviedb.org/"* ]]
}
diff --git a/sources/academy/webscraping/scraping_basics_python/12_framework.md b/sources/academy/webscraping/scraping_basics_python/12_framework.md
index eb7d9425f8..3d89792de2 100644
--- a/sources/academy/webscraping/scraping_basics_python/12_framework.md
+++ b/sources/academy/webscraping/scraping_basics_python/12_framework.md
@@ -468,13 +468,13 @@ If you export the dataset as JSON, it should look something like this:
{CrawleeF1DriversExercise.code}
-### Use Crawlee to find the ratings of the most popular Netflix films
+### Use Crawlee to find the user scores of popular Netflix titles
-The [Global Top 10](https://www.netflix.com/tudum/top10) page has a table listing the most popular Netflix films worldwide. Scrape the first 5 movie names from this page, then search for each movie on [IMDb](https://www.imdb.com/). Assume the first search result is correct and retrieve the film's rating. Each item you push to Crawlee's default dataset should include the following data:
+The [Global Top 10](https://www.netflix.com/tudum/top10) page has tables listing popular Netflix titles worldwide. Scrape the first 5 title names from this page, then search for each title on [TMDb](https://www.themoviedb.org/). Assume the first search result is correct and retrieve the title's user score. Each item you push to Crawlee's default dataset should include the following data:
-- URL of the film's IMDb page
+- URL of the title's TMDb page
- Title
-- Rating
+- User score
If you export the dataset as JSON, it should look something like this:
@@ -482,20 +482,25 @@ If you export the dataset as JSON, it should look something like this:
```json
[
{
- "url": "https://www.imdb.com/title/tt32368345/?ref_=fn_tt_tt_1",
- "title": "The Merry Gentlemen",
- "rating": "5.0/10"
+ "url": "https://www.themoviedb.org/movie/1290417-thrash",
+ "title": "Thrash",
+ "user_score": "59%"
},
{
- "url": "https://www.imdb.com/title/tt32359447/?ref_=fn_tt_tt_1",
- "title": "Hot Frosty",
- "rating": "5.4/10"
+ "url": "https://www.themoviedb.org/movie/1629369-the-truth-and-tragedy-of-moriah-wilson",
+ "title": "The Truth and Tragedy of Moriah Wilson",
+ "user_score": "71%"
+ },
+ {
+ "url": "https://www.themoviedb.org/movie/1234731-anaconda",
+ "title": "Anaconda",
+ "user_score": "59%"
},
...
]
```
-To scrape IMDb data, you'll need to construct a `Request` object with the appropriate search URL for each movie title. The following code snippet gives you an idea of how to do this:
+To scrape TMDb data, you'll need to construct a `Request` object with the appropriate search URL for each title name. The following code snippet gives you an idea of how to do this:
```py
from urllib.parse import quote_plus
@@ -508,8 +513,8 @@ async def main():
requests = []
for name_cell in context.soup.select(...):
name = name_cell.text.strip()
- imdb_search_url = f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft"
- requests.append(Request.from_url(imdb_search_url, label="..."))
+ tmdb_search_url = f"https://www.themoviedb.org/search?query={quote_plus(name)}"
+ requests.append(Request.from_url(tmdb_search_url, label="..."))
await context.add_requests(requests)
...
@@ -517,7 +522,7 @@ async def main():
:::tip Need a nudge?
-When navigating to the first IMDb search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue.
+When navigating to the first TMDb search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue.
:::
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py
index 31449cd7fb..aab91a6f8f 100644
--- a/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/crawlee_netflix_ratings.py
@@ -14,30 +14,26 @@ async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None:
name_cells = context.soup.select('[data-uia="top10-table-row-title"] button')
for name_cell in name_cells[:5]:
name = name_cell.text.strip()
- imdb_search_url = (
- f"https://www.imdb.com/find/?q={quote_plus(name)}&s=tt&ttype=ft"
+ tmdb_search_url = (
+ f"https://www.themoviedb.org/search?query={quote_plus(name)}"
)
- requests.append(Request.from_url(imdb_search_url, label="IMDB_SEARCH"))
+ requests.append(Request.from_url(tmdb_search_url, label="TMDB_SEARCH"))
await context.add_requests(requests)
- @crawler.router.handler("IMDB_SEARCH")
- async def handle_imdb_search(context: BeautifulSoupCrawlingContext) -> None:
- await context.enqueue_links(
- selector=".ipc-title-link-wrapper", label="IMDB", limit=1
- )
-
- @crawler.router.handler("IMDB")
- async def handle_imdb(context: BeautifulSoupCrawlingContext) -> None:
- rating_element = context.soup.select_one(
- "[data-testid='hero-rating-bar__aggregate-rating__score']"
- )
- title_element = context.soup.select_one("h1")
- if rating_element and title_element:
+ @crawler.router.handler("TMDB_SEARCH")
+ async def handle_tmdb_search(context: BeautifulSoupCrawlingContext) -> None:
+ await context.enqueue_links(selector=".results a", label="TMDB", limit=1)
+
+ @crawler.router.handler("TMDB")
+ async def handle_tmdb(context: BeautifulSoupCrawlingContext) -> None:
+ score_element = context.soup.select_one(".user_score_chart")
+ title_element = context.soup.select_one(".title a")
+ if score_element and title_element:
await context.push_data(
{
"url": context.request.url,
"title": title_element.text.strip(),
- "rating": rating_element.text.strip(),
+ "user_score": f"{score_element.get('data-percent')}%",
}
)
diff --git a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats
index 1a0b2844ed..6d6efa6ea5 100644
--- a/sources/academy/webscraping/scraping_basics_python/exercises/test.bats
+++ b/sources/academy/webscraping/scraping_basics_python/exercises/test.bats
@@ -119,10 +119,16 @@ teardown() {
@test "lists Guardian F1 authors" {
run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py
+ [[ $(echo "$output" | wc -l) -gt 5 ]]
[[ "$output" == *' F1 '* ]]
[[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him)
- [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]]
- [[ $(echo "$output" | wc -l) -gt 5 ]]
+
+ # check that each line is in the AUTHOR: TITLE format
+ while IFS= read -r line; do
+ [[ "$line" == *': '* ]]
+ [[ "$line" != ': '* ]]
+ [[ "$line" != *': ' ]]
+ done <<< "$output"
}
@test "lists Python database jobs" {
@@ -150,12 +156,12 @@ teardown() {
[[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]]
}
-@test "scrapes Netflix ratings with Crawlee" {
+@test "scrapes Netflix user scores with Crawlee" {
run uv run -q --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py
(( status == 0 ))
[[ -f dataset.json ]]
[[ $(cat dataset.json | jq '. | length') -eq 5 ]]
- [[ $(cat dataset.json | jq -c '.[0] | keys') == '["rating","title","url"]' ]]
- [[ $(cat dataset.json | jq '.[].url') == *"https://www.imdb.com/title/"* ]]
+ [[ $(cat dataset.json | jq -c '.[0] | keys') == '["title","url","user_score"]' ]]
+ [[ $(cat dataset.json | jq '.[].url') == *"https://www.themoviedb.org/"* ]]
}