2
2
from utils .base_plugin import ListScraper
3
3
import bs4
4
4
import requests
5
- import time
6
5
from loguru import logger
7
6
8
7
class Letterboxd (ListScraper ):
@@ -16,7 +15,16 @@ def get_list(list_id, config=None):
16
15
movies = []
17
16
18
17
while True :
19
- r = requests .get (f"https://letterboxd.com/{ list_id } /detail/by/release-earliest/page/{ page_number } /" , headers = {'User-Agent' : 'Mozilla/5.0' })
18
+ print ("Page number: " , page_number )
19
+ watchlist = list_id .endswith ("/watchlist" )
20
+
21
+ if watchlist :
22
+ r = requests .get (f"https://letterboxd.com/{ list_id } /by/release-earliest/page/{ page_number } /" , headers = {'User-Agent' : 'Mozilla/5.0' })
23
+
24
+ list_name = list_id .split ("/" )[0 ] + " Watchlist"
25
+ description = "Watchlist for " + list_id .split ("/" )[0 ]
26
+ else :
27
+ r = requests .get (f"https://letterboxd.com/{ list_id } /detail/by/release-earliest/page/{ page_number } /" , headers = {'User-Agent' : 'Mozilla/5.0' })
20
28
21
29
soup = bs4 .BeautifulSoup (r .text , 'html.parser' )
22
30
@@ -30,22 +38,43 @@ def get_list(list_id, config=None):
30
38
else :
31
39
description = ""
32
40
33
- for movie_soup in soup .find_all ('div' , {'class' : 'film-detail-content' }):
34
- movie_name = movie_soup .find ('h2' , {'class' : 'headline-2 prettify' }).find ('a' ).text
35
- movie_year = movie_soup .find ('small' , {'class' : 'metadata' })
36
- if movie_year is not None :
37
- movie_year = movie_year .text
38
- movie = {"title" : movie_name , "release_year" : movie_year , "media_type" : "movie" }
41
+ if watchlist :
42
+ page = soup .find_all ('li' , {'class' : 'poster-container' })
43
+ else :
44
+ page = soup .find_all ('div' , {'class' : 'film-detail-content' })
39
45
40
- # Find the imdb id
41
- if config .get ("imdb_id_filter" , False ):
42
- r = requests .get (f"https://letterboxd.com{ movie_soup .find ('a' )['href' ]} " , headers = {'User-Agent' : 'Mozilla/5.0' })
46
+ for movie_soup in page :
47
+ if watchlist :
48
+ movie = {"title" : movie_soup .find ('img' ).attrs ['alt' ], "media_type" : "movie" }
49
+ link = movie_soup .find ('div' , {'class' : 'film-poster' })['data-target-link' ]
50
+ else :
51
+ movie = {"title" : movie_soup .find ('h2' , {'class' : 'headline-2 prettify' }).find ('a' ).text , "media_type" : "movie" }
52
+ movie_year = movie_soup .find ('small' , {'class' : 'metadata' })
53
+ if movie_year is not None :
54
+ movie ["release_year" ] = movie_year .text
55
+
56
+ link = movie_soup .find ('a' )['href' ]
57
+
58
+
59
+ if config .get ("imdb_id_filter" , False ) or 'release_year' not in movie :
60
+ logger .info (f"Getting release year and imdb details for: { movie ['title' ]} " )
61
+
62
+ # Find the imdb id and release year
63
+ r = requests .get (f"https://letterboxd.com{ link } " , headers = {'User-Agent' : 'Mozilla/5.0' })
43
64
movie_soup = bs4 .BeautifulSoup (r .text , 'html.parser' )
65
+
44
66
imdb_id = movie_soup .find ("a" , {"data-track-action" :"IMDb" })
67
+ movie_year = movie_soup .find ("div" , {"class" : "releaseyear" })
68
+
45
69
if imdb_id is not None :
46
70
movie ["imdb_id" ] = imdb_id ["href" ].split ("/title/" )[1 ].split ("/" )[0 ]
47
71
48
- movies .append (movie )
72
+ if movie_year is not None :
73
+ movie ["release_year" ] = movie_year .text
74
+
75
+ # If a movie doesn't have a year, it has only just been announced and its release date is not yet known. We can safely skip these because a movie will have a release year by the time it comes out.
76
+ if 'release_year' in movie :
77
+ movies .append (movie )
49
78
50
79
if soup .find ('a' , {'class' : 'next' }):
51
80
page_number += 1
0 commit comments