Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[YouTube] Add support for extracting auto-translated captions #997

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,7 @@ private void loadSubtitles() {
.setMediaFormat(fmt)
.setLanguageCode(languageCode)
.setAutoGenerated(false)
.setAutoTranslated(false)
.build());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -665,40 +665,77 @@ public List<SubtitlesStream> getSubtitlesDefault() throws ParsingException {

@Override
@Nonnull
public List<SubtitlesStream> getSubtitles(final MediaFormat format) throws ParsingException {
public List<SubtitlesStream> getSubtitles(@Nonnull final MediaFormat format)
throws ParsingException {
assertPageFetched();

// We cannot store the subtitles list because the media format may change
final List<SubtitlesStream> subtitlesToReturn = new ArrayList<>();
final List<SubtitlesStream> subtitles = new ArrayList<>();
final List<SubtitlesStream> autoTranslatedSubtitles = new ArrayList<>();
final JsonObject renderer = playerResponse.getObject("captions")
.getObject("playerCaptionsTracklistRenderer");
final JsonArray captionsArray = renderer.getArray("captionTracks");
// TODO: use this to apply auto translation to different language from a source language
// final JsonArray autoCaptionsArray = renderer.getArray("translationLanguages");

// Generate list of languages available for auto-translations
final List<String> translationLanguages;
if (renderer.has("translationLanguages")) {
translationLanguages = renderer.getArray("translationLanguages")
.stream()
.map(JsonObject.class::cast)
.map(lang -> lang.getString("languageCode"))
.collect(Collectors.toList());
} else {
translationLanguages = Collections.emptyList();
}

// Add subtitles
for (int i = 0; i < captionsArray.size(); i++) {
final String languageCode = captionsArray.getObject(i).getString("languageCode");
final String baseUrl = captionsArray.getObject(i).getString("baseUrl");
final String vssId = captionsArray.getObject(i).getString("vssId");

if (languageCode != null && baseUrl != null && vssId != null) {
final boolean isAutoGenerated = vssId.startsWith("a.");
final String cleanUrl = baseUrl
// Remove preexisting format if exists
.replaceAll("&fmt=[^&]*", "")
// Remove translation language
.replaceAll("&tlang=[^&]*", "");

subtitlesToReturn.add(new SubtitlesStream.Builder()
.setContent(cleanUrl + "&fmt=" + format.getSuffix(), true)
.setMediaFormat(format)
.setLanguageCode(languageCode)
.setAutoGenerated(isAutoGenerated)
.build());
final JsonObject caption = captionsArray.getObject(i);
final String languageCode = caption.getString("languageCode");
final String baseUrl = caption.getString("baseUrl");
final String vssId = caption.getString("vssId");

if (languageCode == null || baseUrl == null || vssId == null) {
continue;
}

TobiGr marked this conversation as resolved.
Show resolved Hide resolved
final boolean isAutoGenerated = vssId.startsWith("a.");
final String cleanUrl = baseUrl
// Remove preexisting format if exists
.replaceAll("&fmt=[^&]*", "")
// Remove translation language
.replaceAll("&tlang=[^&]*", "");

// add base subtitles
subtitles.add(new SubtitlesStream.Builder()
.setContent(cleanUrl + "&fmt=" + format.getSuffix(), true)
.setMediaFormat(format)
.setLanguageCode(languageCode)
.setAutoGenerated(isAutoGenerated)
.setAutoTranslated(false)
.build());

// add auto-translations of this subtitle if available
if (caption.getBoolean("isTranslatable")) {
for (final String tLanguageCode : translationLanguages) {
autoTranslatedSubtitles.add(new SubtitlesStream.Builder()
.setContent(cleanUrl + "&fmt=" + format.getSuffix()
+ "&tlang=" + tLanguageCode, true)
.setMediaFormat(format)
.setLanguageCode(tLanguageCode)
.setAutoGenerated(true)
.setAutoTranslated(true)
.setBaseLanguageCode(languageCode)
.build());
}
}

}

return subtitlesToReturn;
// add auto-translations at the end for better sorting
subtitles.addAll(autoTranslatedSubtitles);

return subtitles;
}

TobiGr marked this conversation as resolved.
Show resolved Hide resolved
@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@

public final class SubtitlesStream extends Stream {
private final MediaFormat format;
@Nullable
private final Locale baseLocale;
private final Locale locale;
private final boolean autoGenerated;
private final boolean autoTranslated;
private final String code;

/**
Expand All @@ -30,8 +33,11 @@ public static final class Builder {
@Nullable
private String manifestUrl;
private String languageCode;
@Nullable
private String baseLanguageCode;
// Use of the Boolean class instead of the primitive type needed for setter call check
private Boolean autoGenerated;
private Boolean autoTranslated;

/**
* Create a new {@link Builder} instance with default values.
Expand Down Expand Up @@ -140,6 +146,18 @@ public Builder setLanguageCode(@Nonnull final String languageCode) {
return this;
}

/**
* Set the language code of the base (source) language from which the
* {@link SubtitlesStream} was auto-translated into its current language code.
*
* <p>
* When {@code null} is passed, the built stream has no base locale. A non-null value which
* cannot be parsed as a locale makes {@link #build()} fail with a {@link ParsingException}.
* </p>
*
* @param baseLanguageCode the language code of the source language of the translation, or
*                         {@code null} if the subtitles are not auto-translated
* @return this {@link Builder} instance
*/
public Builder setBaseLanguageCode(@Nullable final String baseLanguageCode) {
this.baseLanguageCode = baseLanguageCode;
return this;
}

/**
* Set whether the subtitles have been auto-generated by the streaming service.
*
Expand All @@ -152,6 +170,18 @@ public Builder setAutoGenerated(final boolean autoGenerated) {
return this;
}

/**
* Set whether the subtitles have been automatically translated by the streaming service
* (i.e. by a machine, like Google Translate).
*
* <p>
* This value must be set before calling {@link #build()}, otherwise an
* {@link IllegalStateException} is thrown.
* </p>
*
* @param autoTranslated whether the subtitles have been automatically translated by the
*                       streaming service
* @return this {@link Builder} instance
*/
public Builder setAutoTranslated(final boolean autoTranslated) {
this.autoTranslated = autoTranslated;
return this;
}

/**
* Build a {@link SubtitlesStream} using the builder's current values.
*
Expand Down Expand Up @@ -196,31 +226,41 @@ public SubtitlesStream build() throws ParsingException {
+ "with setIsAutoGenerated.");
}

if (autoTranslated == null) {
throw new IllegalStateException("The subtitles stream has been not set as an "
+ "automatically translated subtitles stream or not. "
+ "Please specify this information with setIsAutoTranslated.");
}

if (id == null) {
id = languageCode + (mediaFormat != null ? "." + mediaFormat.suffix
: "");
}

return new SubtitlesStream(id, content, isUrl, mediaFormat, deliveryMethod,
languageCode, autoGenerated, manifestUrl);
languageCode, autoGenerated, autoTranslated, baseLanguageCode, manifestUrl);
}
}

/**
* Create a new subtitles stream.
*
* @param id the identifier which uniquely identifies the stream, e.g. for YouTube
* this would be the itag
* @param content the content or the URL of the stream, depending on whether isUrl is
* true
* @param isUrl whether content is the URL or the actual content of e.g. a DASH
* manifest
* @param mediaFormat the {@link MediaFormat} used by the stream
* @param deliveryMethod the {@link DeliveryMethod} of the stream
* @param languageCode the language code of the stream
* @param autoGenerated whether the subtitles are auto-generated by the streaming service
* @param manifestUrl the URL of the manifest this stream comes from (if applicable,
* otherwise null)
* @param id the identifier which uniquely identifies the stream, e.g. for YouTube
* this would be the itag
* @param content the content or the URL of the stream, depending on whether isUrl is
* true
* @param isUrl whether content is the URL or the actual content of e.g. a DASH
* manifest
* @param mediaFormat the {@link MediaFormat} used by the stream
* @param deliveryMethod the {@link DeliveryMethod} of the stream
* @param languageCode the language code of the stream
* @param autoGenerated whether the subtitles are auto-generated by the streaming service
* @param autoTranslated whether the subtitles are auto-translated by the streaming service
* @param baseLanguageCode the language code of the base language used to translate
* the subtitles to the current language
* or null if the subtitles are not auto-translated
* @param manifestUrl the URL of the manifest this stream comes from (if applicable,
* otherwise null)
*/
@SuppressWarnings("checkstyle:ParameterNumber")
private SubtitlesStream(@Nonnull final String id,
Expand All @@ -230,6 +270,8 @@ private SubtitlesStream(@Nonnull final String id,
@Nonnull final DeliveryMethod deliveryMethod,
@Nonnull final String languageCode,
final boolean autoGenerated,
final boolean autoTranslated,
@Nullable final String baseLanguageCode,
@Nullable final String manifestUrl) throws ParsingException {
super(id, content, isUrl, mediaFormat, deliveryMethod, manifestUrl);
this.locale = LocaleCompat.forLanguageTag(languageCode).orElseThrow(
Expand All @@ -238,6 +280,14 @@ private SubtitlesStream(@Nonnull final String id,
this.code = languageCode;
this.format = mediaFormat;
this.autoGenerated = autoGenerated;
this.autoTranslated = autoTranslated;
if (baseLanguageCode == null) {
this.baseLocale = null;
} else {
this.baseLocale = LocaleCompat.forLanguageTag(baseLanguageCode).orElseThrow(
() -> new ParsingException(
"not a valid locale language code: " + baseLanguageCode));
}
}

/**
Expand All @@ -250,7 +300,7 @@ public String getExtension() {
}

/**
* Return whether if the subtitles are auto-generated.
* Return whether the subtitles are auto-generated.
* <p>
* Some streaming services can generate subtitles for their contents, like YouTube.
* </p>
Expand All @@ -261,6 +311,21 @@ public boolean isAutoGenerated() {
return autoGenerated;
}

/**
* Return whether the subtitles are translated automatically by a machine.
*
* <p>
* Some streaming services provide automatically translated subtitles.
* YouTube, for example, uses Google Translate to generate translated subtitles.
* Automatically translated subtitles might not coincide completely with the original text.
* </p>
*
* @return {@code true} if the subtitles are auto-translated, {@code false} otherwise
*/
public boolean isAutoTranslated() {
return autoTranslated;
}

/**
* {@inheritDoc}
*/
Expand Down Expand Up @@ -299,6 +364,37 @@ public Locale getLocale() {
return locale;
}

/**
* Get the base {@link Locale} which was used to automatically translate the subtitles
* into the current {@link #getLocale() locale}.
*
* @return the base {@link Locale} of the subtitle translation,
* or {@code null} if no base language code was set (i.e. the subtitles are not
* auto-translated)
*/
@Nullable
public Locale getBaseLocale() {
return baseLocale;
}

/**
 * Get the display name of the base language of the subtitles, i.e. the language the
 * subtitles were auto-translated from, written in that base language itself.
 *
 * @return the display name of the base language, or {@code null} if the subtitles have no
 *         base locale (they are not auto-translated)
 */
@Nullable
public String getDisplayBaseLanguageName() {
    // Use the base locale, not the stream's own locale: the latter is what
    // the regular display-language getter is for.
    if (baseLocale == null) {
        return null;
    }
    return baseLocale.getDisplayName(baseLocale);
}

/**
 * Get the language tag of the base language of the subtitles, i.e. the language the
 * subtitles were auto-translated from.
 *
 * <p>
 * The raw base language code is not stored by this class, so the tag is rebuilt from the
 * base locale as {@code language[-country[-variant]]}.
 * </p>
 *
 * @return the language tag of the base language, or {@code null} if the subtitles have no
 *         base locale (they are not auto-translated)
 */
@Nullable
public String getBaseLanguageTag() {
    if (baseLocale == null) {
        return null;
    }

    // Returning `code` here would yield the tag of the (translated) stream itself,
    // not the tag of the language it was translated from.
    final StringBuilder tag = new StringBuilder(baseLocale.getLanguage());
    if (!baseLocale.getCountry().isEmpty()) {
        tag.append('-').append(baseLocale.getCountry());
    }
    if (!baseLocale.getVariant().isEmpty()) {
        tag.append('-').append(baseLocale.getVariant());
    }
    return tag.toString();
}


/**
* No subtitles which are currently extracted use an {@link ItagItem}, so {@code null} is
* returned by this method.
Expand All @@ -310,4 +406,16 @@ public Locale getLocale() {
public ItagItem getItagItem() {
return null;
}

/**
 * Build a human-readable description of this subtitles stream, listing its format,
 * base locale, locale, auto-generated / auto-translated flags and language code.
 *
 * @return the textual representation of this stream
 */
@Override
public String toString() {
    final StringBuilder description = new StringBuilder("SubtitlesStream{");
    description.append("format=").append(format);
    description.append(", baseLocale=").append(baseLocale);
    description.append(", locale=").append(locale);
    description.append(", autoGenerated=").append(autoGenerated);
    description.append(", autoTranslated=").append(autoTranslated);
    description.append(", code='").append(code).append('\'');
    description.append('}');
    return description.toString();
}
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.schabi.newpipe.extractor.utils;

import javax.annotation.Nonnull;
import java.util.Locale;
import java.util.Optional;

Expand All @@ -16,7 +17,7 @@ private LocaleCompat() {

// Source: The AndroidX LocaleListCompat class's private forLanguageTagCompat() method.
// Use Locale.forLanguageTag() on Android API level >= 21 / Java instead.
public static Optional<Locale> forLanguageTag(final String str) {
public static Optional<Locale> forLanguageTag(@Nonnull final String str) {
if (str.contains("-")) {
final String[] args = str.split("-", -1);
if (args.length > 2) {
Expand Down
Loading
Loading