-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 12 changed files with 175 additions and 1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,3 +19,5 @@ | |
|
||
# Go workspace file | ||
go.work | ||
|
||
bin/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# build compiles the vcat command-line tool into bin/vcat.
# Declared phony so the target always runs, even if a file named "build" exists.
.PHONY: build
build:
	go build -o bin/vcat cmd/main.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package main | ||
|
||
import ( | ||
"flag" | ||
"fmt" | ||
"os" | ||
|
||
"github.com/hum/vcat" | ||
) | ||
|
||
var ( | ||
videoURL string | ||
) | ||
|
||
func main() { | ||
flag.StringVar(&videoURL, "url", "", "url to the video to get transcription from") | ||
flag.StringVar(&videoURL, "u", "", "url to the video to get transcription from") | ||
flag.Parse() | ||
|
||
if videoURL == "" { | ||
flag.Usage() | ||
os.Exit(1) | ||
} | ||
|
||
captions, err := vcat.GetCaptions(videoURL) | ||
if err != nil { | ||
panic(err) | ||
} | ||
transcript, err := vcat.GetTranscript(captions.PlayerCaptionsTracklistRenderer.CaptionTracks[0].BaseUrl) | ||
if err != nil { | ||
panic(err) | ||
} | ||
|
||
fmt.Println(vcat.StringIdentStruct(transcript)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
module github.com/hum/vcat | ||
|
||
go 1.21.5 | ||
|
||
require github.com/mitchellh/mapstructure v1.5.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= | ||
github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package vcat | ||
|
||
import ( | ||
"encoding/json" | ||
"strings" | ||
|
||
"github.com/mitchellh/mapstructure" | ||
) | ||
|
||
// GetCaptionsFromRawHtml is an ad-hoc function to parse the provided HTML body | ||
// into a struct, which represents the available captions for a given YouTube video. | ||
// | ||
// @TODO: This is a very naive function which does not handle any edge-cases. | ||
func GetCaptionsFromRawHtml(b []byte) (Captions, error) { | ||
dataStr := string(b) | ||
|
||
// Magic which will most likely break in the future. | ||
// Unfortunately the response is in HTML, so we parse it as a string | ||
// and only load the necessary parts as a valid JSON. | ||
parts := strings.Split(dataStr, "\"captions\":") | ||
parts = strings.Split(parts[1], ",\"videoDetails\"") | ||
|
||
var jsonString map[string]interface{} | ||
json.Unmarshal([]byte(parts[0]), &jsonString) | ||
|
||
var captions Captions | ||
err := mapstructure.Decode(jsonString, &captions) | ||
return captions, err | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
package vcat | ||
|
||
import ( | ||
"io" | ||
"net/http" | ||
) | ||
|
||
// GetBodyAsByteSlice makes an HTTP GET request to the provided URL
// and returns the response's body as a byte slice.
//
// The request is sent with http.Get. The HTTP status code is not inspected, so
// the body of a non-2xx response is returned like any other. An error is
// returned when the request itself fails or when reading the body fails.
func GetBodyAsByteSlice(url string) ([]byte, error) {
	r, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	// The body must be closed on every path, otherwise the underlying
	// connection is leaked.
	defer r.Body.Close()

	return io.ReadAll(r.Body)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package vcat | ||
|
||
import ( | ||
"encoding/json" | ||
) | ||
|
||
// StringIdentStruct renders the given value as pretty-printed JSON, indenting
// nested levels with a tab and using no line prefix.
//
// Values that cannot be marshalled yield an empty string; the marshalling
// error is not reported to the caller.
func StringIdentStruct(strct any) string {
	const (
		linePrefix = ""
		indent     = "\t"
	)

	encoded, err := json.MarshalIndent(strct, linePrefix, indent)
	if err != nil {
		// The signature has no error result, so a failure yields "".
		return ""
	}
	return string(encoded)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package vcat | ||
|
||
import "encoding/xml" | ||
|
||
// Captions represents the main structure
// which holds all of the necessary data to retrieve the actual captions.
//
// The JSON tags mirror the lower-camel-case keys of the source document.
type Captions struct {
	PlayerCaptionsTracklistRenderer struct {
		// CaptionTracks lists the available caption tracks. BaseUrl is the
		// address the transcript of a track is fetched from.
		CaptionTracks []struct {
			BaseUrl string `json:"baseUrl"`
			Name    struct {
				SimpleText string `json:"simpleText"`
			} `json:"name"`
			LanguageCode string `json:"languageCode"`
			// Kind was previously tagged "asr", which looks like a sample
			// value of the field rather than its key, so encoding/json never
			// populated it from a "kind" key.
			Kind           string `json:"kind"`
			IsTranslatable bool   `json:"isTranslatable"`
		} `json:"captionTracks"`

		// TranslationLanguages lists the languages captions can be translated into.
		TranslationLanguages []struct {
			LanguageCode string `json:"languageCode"`
			LanguageName struct {
				SimpleText string `json:"simpleText"`
			} `json:"languageName"`
		} `json:"translationLanguages"`
	} `json:"playerCaptionsTracklistRenderer"`
}
|
||
// Transcript is the decoded form of a transcript XML document: a <transcript>
// root element holding a sequence of <text> elements.
//
// The XMLName fields carry the element names for encoding/xml and are left
// out of JSON output.
type Transcript struct {
	XMLName xml.Name `xml:"transcript" json:"-"`

	// Text holds one entry per <text> element, in document order.
	Text []struct {
		XMLName xml.Name `xml:"text" json:"-"`

		// Start and Duration are the raw string values of the "start" and
		// "dur" attributes; they are not parsed into numbers here.
		Start    string `xml:"start,attr"`
		Duration string `xml:"dur,attr"`

		// Context is the unprocessed inner XML of the element.
		Context string `xml:",innerxml"`
	} `xml:"text"`
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package vcat | ||
|
||
// GetCaptions is the entrypoint to fetching video captions for a YouTube video. | ||
// | ||
// @TODO: Handle edge-cases. | ||
func GetCaptions(url string) (Captions, error) { | ||
b, err := GetBodyAsByteSlice(url) | ||
if err != nil { | ||
return Captions{}, err | ||
} | ||
return GetCaptionsFromRawHtml(b) | ||
} | ||
|
||
// GetTranscript is the entrypoint for fetching the actual transcript for a YouTube video. | ||
func GetTranscript(url string) (Transcript, error) { | ||
b, err := GetBodyAsByteSlice(url) | ||
if err != nil { | ||
return Transcript{}, nil | ||
} | ||
return ParseTranscriptFromXml(b) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
package vcat | ||
|
||
import "encoding/xml" | ||
|
||
// ParseTranscriptFromXml takes the raw body of the response and turns it into a | ||
// valid struct representation of the transcript. | ||
func ParseTranscriptFromXml(b []byte) (Transcript, error) { | ||
var transcript Transcript | ||
err := xml.Unmarshal(b, &transcript) | ||
return transcript, err | ||
} |