If you get access to subtitles, you may find that they repeat overlapping text, which is a real pain if you just want a transcript. The repeated text is pretty valuable in the newer subtitle formats like WebVTT, as you can highlight words as they are being spoken, but for transcript processing it’s not helpful.
3309
00:51:04,309 --> 00:51:09,150
you know much of the talk so thank you very much

3310
00:51:09,150 --> 00:51:09,550
so thank you very much

3311
00:51:09,550 --> 00:51:12,570
so thank you very much for more information please

3312
00:51:12,570 --> 00:51:12,970
for more information please

3313
00:51:12,970 --> 00:51:16,290
for more information please visit www.freddyandeddy.com see

3314
00:51:16,290 --> 00:51:16,690
visit www.freddyandeddy.com see

3315
00:51:16,690 --> 00:51:22,690
visit www.freddyandeddy.com see dots UK
The first step in handling this is to write code to parse the file. Since each entry starts with a number, then a time window, and then the text, it lends itself to a state machine (which also keeps us safe if we run across a file where a line of the transcript itself is purely numeric, etc.).
/**
 * Extracts the caption text lines from the lines of an SRT-style subtitle file.
 *
 * Each cue is expected to look like:
 *   <cue number>
 *   <hh:mm:ss,mmm --> hh:mm:ss,mmm>
 *   <one or more lines of text>
 *   <blank line>
 *
 * A small state machine tracks which part of the cue is expected next, so a
 * caption line that happens to be purely numeric is still treated as text
 * rather than as the start of a new cue.
 *
 * @param {string[]} lines - The subtitle file split into lines.
 * @returns {string[]} The non-empty caption text lines, in file order.
 */
function process(lines) {
  const cueNumber = /^\d+$/;
  const cueTiming = /^\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+$/;

  const result = [];
  let state = "number";

  for (const rawLine of lines) {
    // Files with CRLF line endings leave a trailing "\r" after split("\n");
    // drop it so the patterns and the blank-line check still match.
    const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine;

    switch (state) {
      case "number":
        if (cueNumber.test(line)) {
          state = "timing";
        }
        break;
      case "timing":
        if (cueTiming.test(line)) {
          state = "text";
        }
        break;
      default:
        // In the text state: a blank line ends the cue, anything else is caption text.
        if (line === "") {
          state = "number";
        } else {
          result.push(line);
        }
    }
  }

  // Every line has already been handled inside the loop; there is nothing to
  // flush here (re-processing the last line would emit it twice when the file
  // ends on a caption line without a trailing blank line).
  return result;
}
Once you run this, you'll get a list of just the appropriate lines of text from the subtitles.
Clearly we need to get rid of the duplicate text - the next step is to define a function that can detect the bits of text that are replicated from section to section, shown here:
/**
 * Finds the longest leading portion of `b` that also appears somewhere in `a`.
 *
 * Candidates are tried from the whole of `b` downwards, dropping one trailing
 * character at a time, and the first one contained in `a` is returned.
 *
 * @param {string} a - The earlier piece of text to search in.
 * @param {string} b - The later piece of text whose beginning may repeat part of `a`.
 * @returns {string} The longest matching prefix of `b`, or "" when nothing matches.
 */
function findOverlap(a, b) {
  for (let prefixLength = b.length; prefixLength > 0; prefixLength--) {
    const candidate = b.slice(0, prefixLength);
    // A match at the very end of `a` is just a special case of "contained in `a`".
    if (a.includes(candidate)) {
      return candidate;
    }
  }
  return "";
}
Once we've done this, we can reconstitute the subtitle string, but without anything that overlaps. This function should join everything together (with spaces). The tricky thing here is not to treat overlapping whitespace as a real overlap, nor a single character that just happens to match (the final word of one sentence and the first word of the next both start with "s", for example).
To address these issues I've arbitrarily chosen to make sure the overlap is five characters or more. When adding the space between the segments, it must also go at the beginning, since it is a new character and wouldn't show up in the overlap detection.
// Split the raw subtitle text into lines and keep only what process() returns for them.
// NOTE(review): `subtitles` is not defined in this snippet - presumably a string
// holding the whole subtitle file; confirm against the surrounding code.
let textLines = process(subtitles.split("\n"));
/**
 * Joins caption lines into a single transcript string, dropping text that a
 * line repeats from the line before it.
 *
 * For each consecutive pair, `findOverlap(previous, current)` gives the leading
 * part of `current` that already appears in `previous`. If that overlap is at
 * least `minOverlap` characters long, only the remainder of `current` is
 * appended; shorter overlaps (a shared space or a coincidental first letter)
 * are ignored and the whole line is appended.
 *
 * @param {string[]} lines - Caption text lines in order.
 * @param {number} [minOverlap=5] - Minimum overlap length treated as a real repeat.
 * @returns {string} The de-duplicated transcript, segments separated by single spaces;
 *   "" when `lines` is empty.
 */
function filterDuplicateText(lines, minOverlap = 5) {
  if (lines.length === 0) {
    return "";
  }

  let text = lines[0];
  for (let idx = 1; idx < lines.length; idx++) {
    const current = lines[idx];
    const overlap = findOverlap(lines[idx - 1], current);

    if (overlap.length < minOverlap) {
      text += ' ' + current;
      continue;
    }

    // When `current` extends the previous line, the remainder starts with the
    // space that separated the old text from the new; strip it so that the
    // joining space below is the only one.
    const nonOverlap = current.substring(overlap.length).trimStart();
    if (nonOverlap.length > 0) {
      text += ' ' + nonOverlap;
    }
  }
  return text;
}