Skip to content

Commit

Permalink
Analyze subtitle files with PyAV
Browse files Browse the repository at this point in the history
  • Loading branch information
WyattBlue committed Jun 22, 2024
1 parent 3a46c26 commit 758b215
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 17 deletions.
70 changes: 53 additions & 17 deletions auto_editor/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
from dataclasses import dataclass
from fractions import Fraction
from typing import TYPE_CHECKING

import numpy as np
Expand All @@ -25,6 +26,7 @@
pAttr,
pAttrs,
)
from auto_editor.utils.subtitle_tools import convert_ass_to_text
from auto_editor.wavfile import read

if TYPE_CHECKING:
Expand Down Expand Up @@ -307,31 +309,65 @@ def subtitle(
except re.error as e:
self.log.error(e)

sub_file = self.ensure.subtitle(self.src, stream)
parser = SubtitleParser(self.tb)
import av

with open(sub_file, encoding="utf-8") as file:
parser.parse(file.read(), "webvtt")
try:
container = av.open(self.src.path, "r")
subtitle_stream = container.streams.subtitles[stream]
assert isinstance(subtitle_stream.time_base, Fraction)
except Exception as e:
self.log.error(e)

# stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
def cleanhtml(raw_html: str) -> str:
cleanr = re.compile("<.*?>")
return re.sub(cleanr, "", raw_html)
# Get the length of the subtitle stream.
sub_length = 0
for packet in container.demux(subtitle_stream):
for subset in packet.decode():
if packet.pts is None or packet.duration is None:
continue
# See definition of `AVSubtitle`
# in: https://ffmpeg.org/doxygen/trunk/avcodec_8h_source.html
start = float(packet.pts * subtitle_stream.time_base)
dur = float(packet.duration * subtitle_stream.time_base)

if not parser.contents:
self.log.error("subtitle has no valid entries")
end = round((start + dur) * self.tb)
sub_length = max(sub_length, end)

result = np.zeros((parser.contents[-1].end), dtype=np.bool_)
result = np.zeros((sub_length), dtype=np.bool_)
del sub_length

count = 0
for content in parser.contents:
if max_count is not None and count >= max_count:
early_exit = False
container.seek(0)
for packet in container.demux(subtitle_stream):
if early_exit:
break

line = cleanhtml(content.after.strip())
if line and re.search(pattern, line):
result[content.start : content.end] = 1
count += 1
for subset in packet.decode():
if packet.pts is None or packet.duration is None:
continue
if max_count is not None and count >= max_count:
early_exit = True
break

start = float(packet.pts * subtitle_stream.time_base)
dur = float(packet.duration * subtitle_stream.time_base)

san_start = round(start * self.tb)
san_end = round((start + dur) * self.tb)

for sub in subset:
if sub.type == b"ass":
line = convert_ass_to_text(sub.ass.decode(errors="ignore"))
elif sub.type == b"text":
line = sub.text.decode(errors="ignore")
else:
continue

if line and re.search(pattern, line):
result[san_start:san_end] = 1
count += 1

container.close()

return result

Expand Down
2 changes: 2 additions & 0 deletions auto_editor/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def audio(self, src: FileInfo, stream: int) -> str:

if first_time:
self.log.conwrite("Extracting audio")
self.log.debug(f"Making external audio for stream: {stream}")

cmd = ["-i", f"{src.path}", "-map", f"0:a:{stream}"]
cmd += ["-ac", "2", "-ar", f"{self._sr}", "-rf64", "always", out_path]
Expand All @@ -52,6 +53,7 @@ def subtitle(self, src: FileInfo, stream: int) -> str:

if first_time:
self.log.conwrite("Extracting subtitle")
self.log.debug(f"Making external subtitle: {out_path}")
self._ffmpeg.run(["-i", f"{src.path}", "-map", f"0:s:{stream}", out_path])

return out_path
Expand Down
29 changes: 29 additions & 0 deletions auto_editor/utils/subtitle_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
def convert_ass_to_text(ass_text: str) -> str:
    """Return the plain text of one ASS subtitle line.

    Everything up to and including the eighth comma of *ass_text* is
    discarded (presumably the ASS event fields that precede the text
    field -- confirm against the packet layout PyAV hands over).  The
    rest is copied to the result with two transformations:

    * the two-character sequence ``\\N`` becomes a newline;
    * anything from ``{`` up to and including the next ``}`` is dropped.

    If *ass_text* has fewer than eight commas the result is ``""``.
    An unclosed ``{`` swallows the remainder of the line.
    """
    # maxsplit=8 keeps any commas that belong to the text itself intact.
    fields = ass_text.split(",", 8)
    if len(fields) < 9:
        return ""
    body = fields[8]

    # Collect fragments and join once instead of growing a str in a loop.
    pieces = []
    in_braces = False
    pos = 0
    end = len(body)

    while pos < end:
        # NOTE: the line-break escape is honoured even inside `{...}`;
        # this is kept as-is so existing output does not change.
        if body.startswith("\\N", pos):
            pieces.append("\n")
            pos += 2
            continue

        char = body[pos]
        pos += 1

        if in_braces:
            # Stay in skip mode until the closing brace is consumed.
            in_braces = char != "}"
        elif char == "{":
            in_braces = True
        else:
            pieces.append(char)

    return "".join(pieces)

0 comments on commit 758b215

Please sign in to comment.