import codecs import math import os import re class dfxp_to_srt: def __init__(self): self.__replace__ = "empty_line" def leading_zeros(self, value, digits=2): value = "000000" + str(value) return value[-digits:] def convert_time(self, raw_time): if int(raw_time) == 0: return "{}:{}:{},{}".format(0, 0, 0, 0) ms = "000" if len(raw_time) > 4: ms = self.leading_zeros(int(raw_time[:-4]) % 1000, 3) time_in_seconds = int(raw_time[:-7]) if len(raw_time) > 7 else 0 second = self.leading_zeros(time_in_seconds % 60) minute = self.leading_zeros(int(math.floor(time_in_seconds / 60)) % 60) hour = self.leading_zeros(int(math.floor(time_in_seconds / 3600))) return "{}:{}:{},{}".format(hour, minute, second, ms) def xml_id_display_align_before(self, text): align_before_re = re.compile( u'' ) has_align_before = re.search(align_before_re, text) if has_align_before: return has_align_before.group(1) return u"" def xml_to_srt(self, text): def append_subs(start, end, prev_content, format_time): subs.append( { "start_time": self.convert_time(start) if format_time else start, "end_time": self.convert_time(end) if format_time else end, "content": u"\n".join(prev_content), } ) display_align_before = self.xml_id_display_align_before(text) begin_re = re.compile(u"\s*

(.*)

') # span tags are only used for italics, so we'll get rid of them # and replace them by and , which is the standard for .srt files span_start_re = re.compile(u'()+') span_end_re = re.compile(u"()+") br_re = re.compile(u"()+") fmt_t = True for s in sub_lines: span_start_tags = re.search(span_start_re, s) if span_start_tags: s = u"".join(s.split(span_start_tags.group())) string_region_re = ( r'(.*)

' ) s = re.sub(string_region_re, r"{\\an8}\2

", s) content = re.search(content_re, s).group(1) br_tags = re.search(br_re, content) if br_tags: content = u"\n".join(content.split(br_tags.group())) span_end_tags = re.search(span_end_re, content) if span_end_tags: content = u"
".join(content.split(span_end_tags.group())) prev_start = prev_time["start"] start = re.search(start_re, s).group(1) end = re.search(end_re, s).group(1) if len(start.split(":")) > 1: fmt_t = False start = start.replace(".", ",") end = end.replace(".", ",") if (prev_start == start and prev_time["end"] == end) or not prev_start: # Fix for multiple lines starting at the same time prev_time = {"start": start, "end": end} prev_content.append(content) continue append_subs(prev_time["start"], prev_time["end"], prev_content, fmt_t) prev_time = {"start": start, "end": end} prev_content = [content] append_subs(start, end, prev_content, fmt_t) lines = ( u"{}\n{} --> {}\n{}\n".format( s + 1, subs[s]["start_time"], subs[s]["end_time"], subs[s]["content"] ) for s in range(len(subs)) ) return u"\n".join(lines) def convert(self, Input, Output): with codecs.open(Input, "rb", "utf-8") as f: text = f.read() with codecs.open(Output, "wb", "utf-8") as f: f.write(self.xml_to_srt(text)) return