import math  # no longer used by the functions below; kept in case other parts of the file rely on it
import re


def leading_zeros(value, digits=2):
    """Return *value* as a string left-padded with zeros to *digits* characters.

    Values that are already wider than *digits* are returned unchanged; they
    are never truncated, so e.g. an hour count of 100 stays "100".
    """
    return str(value).zfill(digits)


def convert_time(raw_time):
    """Convert a tick count into an SRT timestamp ``HH:MM:SS,mmm``.

    The last seven digits of the tick count are the sub-second part, i.e. one
    second is 10,000,000 ticks and one millisecond is 10,000 ticks.

    Args:
        raw_time: the tick count, as a string of digits or as an int.

    Returns:
        The zero-padded timestamp, e.g. ``"00:00:01,000"`` for ``"10000000"``.

    Raises:
        ValueError: if *raw_time* cannot be parsed as an integer.
    """
    ticks_per_millisecond = 10000
    total_ms = int(raw_time) // ticks_per_millisecond
    # Pure integer arithmetic: no float rounding for very large tick counts,
    # and zero needs no special case to come out fully padded.
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, second = divmod(total_seconds, 60)
    hour, minute = divmod(total_minutes, 60)
    return "{}:{}:{},{}".format(
        leading_zeros(hour),
        leading_zeros(minute),
        leading_zeros(second),
        leading_zeros(ms, 3),
    )


def to_srt(text):
    """Convert timed-text XML subtitles into SubRip (.srt) text.

    Every line of *text* that contains a ``<p begin=`` element is treated as
    one cue. Cues that share the same start and end time are merged into a
    single SRT entry, one source line per row. Tick-based times are run
    through ``convert_time``; times that already contain ``:`` are kept and
    only get their ``.`` replaced by ``,``.

    Args:
        text: the full subtitle document as one string.

    Returns:
        The SRT document: numbered entries separated by a blank line, or an
        empty string when *text* contains no cue lines.

    Raises:
        AttributeError: if a cue line lacks a begin time, an end time or a
            closing ``</p>`` on the same line.
    """
    # NOTE(review): the markup inside these patterns, and the initial state
    # set up below, were lost from the copy of the file under review (every
    # "<...>" span had been stripped). They are reconstructed from the
    # surviving fragments -- confirm them against the original source.
    begin_re = re.compile(r'\s*<p begin=')
    start_re = re.compile(r'begin="([0-9:.]*)')
    end_re = re.compile(r'end="([0-9:.]*)')
    content_re = re.compile(r'">(.*)</p>')
    alt_content_re = re.compile(r'<span style="[a-zA-Z0-9_.]+">(.*?)</span>')
    br_re = re.compile(r'(<br\s*/?>)+')

    subs = []

    def append_subs(start, end, prev_content, format_time):
        subs.append({
            "start_time": convert_time(start) if format_time else start,
            "end_time": convert_time(end) if format_time else end,
            "content": u"\n".join(prev_content),
        })

    sub_lines = [line for line in text.split("\n") if begin_re.search(line)]
    if not sub_lines:
        # Nothing to convert; avoids flushing an entry with empty timestamps.
        return u""

    # A falsy start time marks "no cue collected yet".
    prev_time = {"start": 0, "end": 0}
    prev_content = []
    fmt_t = True
    for s in sub_lines:
        # Background text may carry additional styling, and may contain
        # several (possibly nested) <span> groups: unwrap them one by one
        # until none is left.
        alt_content = alt_content_re.search(s)
        while alt_content:
            s = s.replace(alt_content.group(0), alt_content.group(1))
            alt_content = alt_content_re.search(s)

        # Replace every run of <br> tags, whatever its spelling, with a
        # single newline.
        content = br_re.sub(u"\n", content_re.search(s).group(1))

        prev_start = prev_time["start"]
        start = start_re.search(s).group(1)
        end = end_re.search(s).group(1)
        if len(start.split(":")) > 1:
            # Already a clock time: only the decimal separator differs in SRT.
            fmt_t = False
            start = start.replace(".", ",")
            end = end.replace(".", ",")

        same_cue = prev_start == start and prev_time["end"] == end
        if same_cue or not prev_start:
            # Fix for multiple lines starting at the same time
            prev_time = {"start": start, "end": end}
            prev_content.append(content)
            continue

        append_subs(prev_time["start"], prev_time["end"], prev_content, fmt_t)
        prev_time = {"start": start, "end": end}
        prev_content = [content]

    # Flush the cue that was still being collected when the input ended.
    append_subs(start, end, prev_content, fmt_t)

    lines = (
        u"{}\n{} --> {}\n{}\n".format(
            number, sub["start_time"], sub["end_time"], sub["content"])
        for number, sub in enumerate(subs, 1)
    )
    return u"\n".join(lines)