<CodeBlock language="js">{CrawleeF1DriversExercise.code}</CodeBlock>
</details>

### Use Crawlee to find the user scores of popular Netflix titles

The [Global Top 10](https://www.netflix.com/tudum/top10) page has tables listing popular Netflix titles worldwide. Scrape the first 5 title names from this page, then search for each title on [TMDb](https://www.themoviedb.org/). Assume the first search result is correct and retrieve the title's user score. Each item you push to Crawlee's default dataset should include the following data:

- URL of the title's TMDB page
- Title
- User score

If you export the dataset as JSON, it should look something like this:

<!-- eslint-skip -->
```json
[
  {
    "url": "https://www.themoviedb.org/movie/1290417-thrash",
    "title": "Thrash",
    "user_score": "59%"
  },
  {
    "url": "https://www.themoviedb.org/movie/1629369-the-truth-and-tragedy-of-moriah-wilson",
    "title": "The Truth and Tragedy of Moriah Wilson",
    "user_score": "71%"
  },
  {
    "url": "https://www.themoviedb.org/movie/1234731-anaconda",
    "title": "Anaconda",
    "user_score": "59%"
  },
  ...
]
```

To scrape TMDb data, you'll need to construct a `Request` object with the appropriate search URL for each title name. The following code snippet gives you an idea of how to do this:

```js
import { CheerioCrawler, Request } from 'crawlee';
import { escape } from 'node:querystring';

const tmdbSearchUrl = `https://www.themoviedb.org/search?query=${escape(name)}`;
const request = new Request({ url: tmdbSearchUrl, label: 'TMDB_SEARCH' });
```

Then use the `addRequests()` function to instruct Crawlee to follow an array of these manually constructed requests.

:::tip Need a nudge?

When navigating to the first TMDb search result, you might find it helpful to know that `enqueueLinks()` accepts a `limit` option, letting you specify the max number of HTTP requests to enqueue.

:::

```js
crawler.router.addDefaultHandler(async ({ $, addRequests }) => {
    const buttons = $("[data-uia='top10-table-row-title'] button").toArray().slice(0, 5);
    const requests = buttons.map((buttonElement) => {
        const name = $(buttonElement).text().trim();
        const tmdbSearchUrl = `https://www.themoviedb.org/search?query=${escape(name)}`;
        return new Request({ url: tmdbSearchUrl, label: 'TMDB_SEARCH' });
    });
    await addRequests(requests);
});

crawler.router.addHandler('TMDB_SEARCH', async ({ enqueueLinks }) => {
    await enqueueLinks({ selector: '.results a', label: 'TMDB', limit: 1 });
});

crawler.router.addHandler('TMDB', async ({ $, request, pushData }) => {
    const title = $('.title a').first().text().trim();
    const userScore = $('.user_score_chart').first().attr('data-percent');

    if (title && userScore) {
        await pushData({
            url: request.url,
            title,
            user_score: `${userScore}%`,
        });
    }
});
```
```bash
@test "lists Guardian F1 authors" {
    run node guardian_f1_authors.mjs

    [[ "$output" == *' F1 '* ]]
    [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him)
    [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]]
    [[ $(echo "$output" | wc -l) -gt 5 ]]

    # check that each line is in the AUTHOR: TITLE format
    while IFS= read -r line; do
        [[ "$line" == *': '* ]]
        [[ "$line" != ': '* ]]
        [[ "$line" != *': ' ]]
    done <<< "$output"
}

@test "lists JavaScript GitHub repos with the LLM topic" {

# ...

    [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]]
}

@test "scrapes Netflix user scores with Crawlee" {
    run node crawlee_netflix_ratings.mjs

    (( status == 0 ))
    [[ -f dataset.json ]]
    [[ $(cat dataset.json | jq '. | length') == "5" ]]
    [[ $(cat dataset.json | jq -c '.[0] | keys') == '["title","url","user_score"]' ]]
    [[ $(cat dataset.json | jq '.[].url') == *"https://www.themoviedb.org/"* ]]
}
```
<CodeBlock language="py">{CrawleeF1DriversExercise.code}</CodeBlock>
</details>

### Use Crawlee to find the user scores of popular Netflix titles

The [Global Top 10](https://www.netflix.com/tudum/top10) page has tables listing popular Netflix titles worldwide. Scrape the first 5 title names from this page, then search for each title on [TMDb](https://www.themoviedb.org/). Assume the first search result is correct and retrieve the title's user score. Each item you push to Crawlee's default dataset should include the following data:

- URL of the title's TMDB page
- Title
- User score

If you export the dataset as JSON, it should look something like this:

<!-- eslint-skip -->
```json
[
  {
    "url": "https://www.themoviedb.org/movie/1290417-thrash",
    "title": "Thrash",
    "user_score": "59%"
  },
  {
    "url": "https://www.themoviedb.org/movie/1629369-the-truth-and-tragedy-of-moriah-wilson",
    "title": "The Truth and Tragedy of Moriah Wilson",
    "user_score": "71%"
  },
  {
    "url": "https://www.themoviedb.org/movie/1234731-anaconda",
    "title": "Anaconda",
    "user_score": "59%"
  },
  ...
]
```

To scrape TMDb data, you'll need to construct a `Request` object with the appropriate search URL for each title name. The following code snippet gives you an idea of how to do this:

```py
from urllib.parse import quote_plus

async def main():
    ...
    requests = []
    for name_cell in context.soup.select(...):
        name = name_cell.text.strip()
        tmdb_search_url = f"https://www.themoviedb.org/search?query={quote_plus(name)}"
        requests.append(Request.from_url(tmdb_search_url, label="..."))
    await context.add_requests(requests)

    ...
```

:::tip Need a nudge?

When navigating to the first TMDb search result, you might find it helpful to know that `context.enqueue_links()` accepts a `limit` keyword argument, letting you specify the max number of HTTP requests to enqueue.

:::

```py
async def handle_netflix_table(context: BeautifulSoupCrawlingContext) -> None:
    name_cells = context.soup.select('[data-uia="top10-table-row-title"] button')
    for name_cell in name_cells[:5]:
        name = name_cell.text.strip()
        tmdb_search_url = (
            f"https://www.themoviedb.org/search?query={quote_plus(name)}"
        )
        requests.append(Request.from_url(tmdb_search_url, label="TMDB_SEARCH"))
    await context.add_requests(requests)

@crawler.router.handler("TMDB_SEARCH")
async def handle_tmdb_search(context: BeautifulSoupCrawlingContext) -> None:
    await context.enqueue_links(selector=".results a", label="TMDB", limit=1)

@crawler.router.handler("TMDB")
async def handle_tmdb(context: BeautifulSoupCrawlingContext) -> None:
    score_element = context.soup.select_one(".user_score_chart")
    title_element = context.soup.select_one(".title a")
    if score_element and title_element:
        await context.push_data(
            {
                "url": context.request.url,
                "title": title_element.text.strip(),
                "user_score": f"{score_element.get('data-percent')}%",
            }
        )
```
```bash
@test "lists Guardian F1 authors" {
    run uv run -q --with=httpx --with=beautifulsoup4 python guardian_f1_authors.py

    [[ "$output" == *' F1 '* ]]
    [[ "$output" == *'Giles Richards: '* ]] # writes most of them (we'll have to change this if they fire him)
    [[ "$output" == *'Guardian sport: '* || "$output" == *'PM Media: '* ]]
    [[ $(echo "$output" | wc -l) -gt 5 ]]

    # check that each line is in the AUTHOR: TITLE format
    while IFS= read -r line; do
        [[ "$line" == *': '* ]]
        [[ "$line" != ': '* ]]
        [[ "$line" != *': ' ]]
    done <<< "$output"
}

@test "lists Python database jobs" {

# ...

    [[ $(cat dataset.json | jq '.[].url') == *"https://www.f1academy.com/Racing-Series/Drivers/"* ]]
}

@test "scrapes Netflix user scores with Crawlee" {
    run uv run -q --with=crawlee[beautifulsoup] python crawlee_netflix_ratings.py

    (( status == 0 ))
    [[ -f dataset.json ]]
    [[ $(cat dataset.json | jq '. | length') -eq 5 ]]
    [[ $(cat dataset.json | jq -c '.[0] | keys') == '["title","url","user_score"]' ]]
    [[ $(cat dataset.json | jq '.[].url') == *"https://www.themoviedb.org/"* ]]
}
```