Skip to content

Commit

Permalink
Bugfixes (#330)
Browse files Browse the repository at this point in the history
* Fix catalog parser for tamilmv & tamilblasters with ESub

* Avoid scraping zipx torrents

* Fix rpdb logging var, increase title ratio matching & cleanup
  • Loading branch information
mhdzumair authored Oct 19, 2024
1 parent 9ebf4b8 commit 95f57a9
Show file tree
Hide file tree
Showing 6 changed files with 14 additions and 6 deletions.
2 changes: 1 addition & 1 deletion db/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class Settings(BaseSettings):
# Content Filtering
adult_content_regex_keywords: str = (
r"(^|\b|\s|$|[\[._-])"
- r"(18\s*\+|adults?|porn|sex|xxx|nude|boobs?|pussy|ass|bigass|bigtits?|blowjob|hardfuck|onlyfans?|naked|hot|milf|slut|doggy|anal|threesome|foursome|erotic|sexy|18\s*plus|trailer|RiffTrax)"
+ r"(18\s*\+|adults?|porn|sex|xxx|nude|boobs?|pussy|ass|bigass|bigtits?|blowjob|hardfuck|onlyfans?|naked|hot|milf|slut|doggy|anal|threesome|foursome|erotic|sexy|18\s*plus|trailer|RiffTrax|zipx)"
r"(\b|\s|$|[\]._-])"
)

Expand Down
11 changes: 10 additions & 1 deletion mediafusion_scrapy/pipelines/catalog_parse_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,14 @@ def process_item(self, item, spider):
return item
video_type = item["video_type"]
languages = item["languages"]
- item["catalog"] = [f"{lang.lower()}_{video_type}" for lang in languages]
+ torrent_name = item["torrent_name"].lower()
+ catalogs = []
+ for language in languages:
+ if language.lower() == "english" and "eng" not in torrent_name:
+ # Fix for ESubs torrents
+ continue
+ if video_type == "dubbed" and language.lower() == "english":
+ continue
+ catalogs.append(f"{language.lower()}_{video_type}")
+ item["catalog"] = catalogs
return item
2 changes: 1 addition & 1 deletion mediafusion_scrapy/spiders/tgx.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class TgxSpider(scrapy.Spider):
keyword_patterns: re.Pattern
scraped_info_hash_key: str

- def __init__(self, scrape_all: str = "True", *args, **kwargs):
+ def __init__(self, scrape_all: str = "False", *args, **kwargs):
super(TgxSpider, self).__init__(*args, **kwargs)
self.scrape_all = scrape_all.lower() == "true"
self.redis = REDIS_ASYNC_CLIENT
Expand Down
2 changes: 1 addition & 1 deletion scrapers/base_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ def validate_title_and_year(
metadata: MediaFusionMetaData,
catalog_type: str,
torrent_title: str,
- expected_ratio: int = 85,
+ expected_ratio: int = 87,
) -> bool:
"""
Validate the title and year of the parsed data against the metadata.
Expand Down
2 changes: 1 addition & 1 deletion scrapers/rpdb.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import asyncio
+ import logging
import time

import httpx
- from sqlalchemy.testing.plugin.plugin_base import logging

from db import schemas
from utils.runtime_const import REDIS_ASYNC_CLIENT
Expand Down
1 change: 0 additions & 1 deletion utils/poster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import aiohttp
from PIL import Image, ImageDraw, ImageFont, UnidentifiedImageError, ImageStat
from imdb import Cinemagoer
- from redis.asyncio import Redis

from db.models import MediaFusionMetaData
from scrapers.imdb_data import get_imdb_rating
Expand Down

0 comments on commit 95f57a9

Please sign in to comment.