Project: FFmpeg2theora
Revision: 15046
Author: j
Date: 17 Jun 2008 07:08:24
Changes: ignoring non utf8 sequence in what is claimed to be utf8. (thanks ogg.k.ogg.k)
Files: modified: /trunk/ffmpeg2theora/src/subtitles.c (try)
modified: /trunk/ffmpeg2theora/src/ffmpeg2theora.h (try)
modified: /trunk/ffmpeg2theora/ffmpeg2theora.1 (try)
modified: /trunk/ffmpeg2theora/src/subtitles.h (try)
modified: /trunk/ffmpeg2theora/subtitles.txt (try)
modified: /trunk/ffmpeg2theora (try)
modified: /trunk/ffmpeg2theora/src/ffmpeg2theora.c (try)
Diff:
| ... | ...@@ -112,14 +112,19 @@ |
| 112 | 112 | } |
| 113 | 113 | |
| 114 | | char *fgets2(char *s,size_t sz,FILE *f) |
| 114 | #ifdef HAVE_KATE |
| 115 | |
| 116 | static char *fgets2(char *s,size_t sz,FILE *f) |
| 115 | 117 | { |
| 116 | 118 | char *ret = fgets(s, sz, f); |
| 117 | 119 | /* fixup DOS newline character */ |
| 118 | 120 | char *ptr=strchr(s, '\r'); |
| 119 | | if (ptr) *ptr='\n'; |
| 121 | if (ptr) { |
| 122 | *ptr='\n'; |
| 123 | *(ptr+1)=0; |
| 124 | } |
| 120 | 125 | return ret; |
| 121 | 126 | } |
| 122 | 127 | |
| 123 | | double hmsms2s(int h,int m,int s,int ms) |
| 128 | static double hmsms2s(int h,int m,int s,int ms) |
| 124 | 129 | { |
| 125 | 130 | return h*3600+m*60+s+ms/1000.0; |
| ... | ...@@ -127,8 +132,9 @@ |
| 127 | 132 | |
| 128 | 133 | /* very simple implementation when no iconv */ |
| 129 | | void convert_subtitle_to_utf8(F2T_ENCODING encoding,unsigned char *text) |
| 134 | static void convert_subtitle_to_utf8(F2T_ENCODING encoding,unsigned char *text,int ignore_non_utf8) |
| 130 | 135 | { |
| 131 | 136 | size_t nbytes; |
| 132 | | unsigned char *ptr,*newtext; |
| 137 | char *ptr,*newtext; |
| 138 | int errors=0; |
| 133 | 139 | |
| 134 | 140 | if (!text || !*text) return; |
| ... | ...@@ -137,7 +143,48 @@ |
| 137 | 143 | case ENC_UNSET: |
| 138 | 144 | /* we don't know what encoding this is, assume utf-8 and we'll yell if it ain't */ |
| 139 | | break; |
| 145 | /* fall through */ |
| 140 | 146 | case ENC_UTF8: |
| 141 | 147 | /* nothing to do, already in utf-8 */ |
| 148 | if (ignore_non_utf8) { |
| 149 | /* actually, give the user the option of just ignoring non UTF8 characters */ |
| 150 | char *wptr; |
| 151 | size_t wlen0; |
| 152 | |
| 153 | nbytes = strlen(text)+1; |
| 154 | newtext=(unsigned char*)malloc(nbytes); |
| 155 | if (!newtext) { |
| 156 | fprintf(stderr, "WARNING - Memory allocation failed - cannot convert text\n"); |
| 157 | return; |
| 158 | } |
| 159 | ptr = text; |
| 160 | wptr = newtext; |
| 161 | wlen0 = nbytes; |
| 162 | while (nbytes>0) { |
| 163 | int ret=kate_text_get_character(kate_utf8, (const char ** const)&ptr, &nbytes); |
| 164 | if (ret>=0) { |
| 165 | /* valid character */ |
| 166 | ret=kate_text_set_character(kate_utf8, ret, &wptr, &wlen0); |
| 167 | if (ret<0) { |
| 168 | fprintf(stderr, "WARNING - failed to filter utf8 text: %s\n", text); |
| 169 | free(newtext); |
| 170 | return; |
| 171 | } |
| 172 | if (ret==0) break; |
| 173 | } |
| 174 | else { |
| 175 | /* skip offending byte - we can't skip the terminating zero as we do byte by byte */ |
| 176 | ++errors; |
| 177 | ++ptr; |
| 178 | --nbytes; |
| 179 | } |
| 180 | } |
| 181 | |
| 182 | if (errors) { |
| 183 | fprintf(stderr, "WARNING - Found non utf8 character(s) in string %s, scrubbed out\n", text); |
| 184 | } |
| 185 | |
| 186 | strcpy(text,newtext); |
| 187 | free(newtext); |
| 188 | } |
| 142 | 189 | break; |
| 143 | 190 | case ENC_ISO_8859_1: |
| ... | ...@@ -151,5 +198,5 @@ |
| 151 | 198 | newtext=(unsigned char*)malloc(1+nbytes); |
| 152 | 199 | if (!newtext) { |
| 153 | | fprintf(stderr, "Memory allocation failed - cannot convert text\n"); |
| 200 | fprintf(stderr, "WARNING - Memory allocation failed - cannot convert text\n"); |
| 154 | 201 | return; |
| 155 | 202 | } |
| ... | ...@@ -174,5 +221,13 @@ |
| 174 | 221 | } |
| 175 | 222 | |
| 176 | | int load_subtitles(ff2theora_kate_stream *this) |
| 223 | static void remove_last_newline(char *text) |
| 224 | { |
| 225 | char *ptr = text+strlen(text)-1; |
| 226 | if (*ptr=='\n') *ptr=0; |
| 227 | } |
| 228 | |
| 229 | #endif |
| 230 | |
| 231 | int load_subtitles(ff2theora_kate_stream *this, int ignore_non_utf8) |
| 177 | 232 | { |
| 178 | 233 | #ifdef HAVE_KATE |
| ... | ...@@ -237,5 +292,9 @@ |
| 237 | 292 | case need_text: |
| 238 | 293 | if (*str=='\n') { |
| 239 | | convert_subtitle_to_utf8(this->subtitles_encoding,(unsigned char*)text); |
| 294 | /* we have all the lines for that subtitle, remove the last \n */ |
| 295 | remove_last_newline(text); |
| 296 | |
| 297 | /* we want all text to be UTF8 */ |
| 298 | convert_subtitle_to_utf8(this->subtitles_encoding,(unsigned char*)text,ignore_non_utf8); |
| 240 | 299 | size_t len = strlen(text); |
| 241 | 300 | this->subtitles = (ff2theora_subtitle*)realloc(this->subtitles, (this->num_subtitles+1)*sizeof(ff2theora_subtitle)); |
| ... | ...@@ -86,5 +86,6 @@ |
| 86 | 86 | size_t n_kate_streams; |
| 87 | 87 | ff2theora_kate_stream *kate_streams; |
| 88 | | |
| 88 | |
| 89 | int ignore_non_utf8; |
| 89 | 90 | // ffmpeg2theora --nosound -f dv -H 32000 -S 0 -v 8 -x 384 -y 288 -G 1.5 input.dv |
| 90 | 91 | double video_gamma; |
| ... | ...@@ -167,4 +167,9 @@ |
| 167 | 167 | "subtitles". Suggested other categories may include "transcript", |
| 168 | 168 | "commentary", "lyrics", etc. |
| 169 | .TP |
| 170 | .B \-\-subtitles-ignore-non-utf8 |
| 171 | When reading an utf-8 subtitles text file, any invalid utf-8 sequence |
| 172 | will be ignored. This may be useful if there are stray sequences in |
| 173 | an otherwise utf-8 file. |
| 169 | 174 | .SS Metadata options: |
| 170 | 175 | .TP |
| ... | ...@@ -17,5 +17,5 @@ |
| 17 | 17 | |
| 18 | 18 | extern void add_kate_stream(ff2theora this); |
| 19 | | extern int load_subtitles(ff2theora_kate_stream *this); |
| 19 | extern int load_subtitles(ff2theora_kate_stream *this, int ignore_non_utf8); |
| 20 | 20 | extern void free_subtitles(ff2theora this); |
| 21 | 21 | |
| ... | ...@@ -26,7 +26,4 @@ |
| 26 | 26 | extern void report_unknown_subtitle_encoding(const char *name); |
| 27 | 27 | |
| 28 | | extern char *fgets2(char *s,size_t sz,FILE *f); |
| 29 | | extern double hmsms2s(int h,int m,int s,int ms); |
| 30 | | extern void convert_subtitle_to_utf8(F2T_ENCODING encoding,unsigned char *text); |
| 31 | 28 | #endif |
| 32 | 29 | |
| ... | ...@@ -1,3 +1,3 @@ |
| 1 | | Subtitles can be embedded in an Ogg stream alongside a Theora video. |
| 1 | Text subtitles can be embedded in an Ogg stream alongside a Theora video. |
| 2 | 2 | |
| 3 | 3 | * Overview |
| ... | ...@@ -5,4 +5,5 @@ |
| 5 | 5 | * Converting non-utf-8 files to utf-8 |
| 6 | 6 | * Examples |
| 7 | * Playing subtitles |
| 7 | 8 | |
| 8 | 9 | |
| ... | ...@@ -61,4 +62,10 @@ |
| 61 | 62 | If unspecified, the default is utf-8. |
| 62 | 63 | |
| 64 | --subtitles-ignore-non-utf8 |
| 65 | Any invalid sequence in utf-8 text will be ignored. This may be useful |
| 66 | when using an utf-8 file with stray non utf-8 characters. This is not |
| 67 | a substitute for converting a non utf-8 file to utf-8, however, as the |
| 68 | non utf-8 sequence will be missing from the output stream. |
| 69 | |
| 63 | 70 | |
| 64 | 71 | |
| ... | ...@@ -130,2 +137,8 @@ |
| 130 | 137 | |
| 131 | 138 | |
| 139 | * Playing subtitles |
| 140 | |
| 141 | At the moment, only VLC has playback support for Kate streams. However, the |
| 142 | libkate distribution includes patches for other players and media frameworks |
| 143 | (MPlayer, GStreamer). |
| 144 | |
| ... | ...@@ -63,4 +63,5 @@ |
| 63 | 63 | SUBTITLES_LANGUAGE_FLAG, |
| 64 | 64 | SUBTITLES_CATEGORY_FLAG, |
| 65 | SUBTITLES_IGNORE_NON_UTF8_FLAG, |
| 65 | 66 | VHOOK_FLAG, |
| 66 | 67 | FRONTEND_FLAG, |
| ... | ...@@ -175,4 +176,5 @@ |
| 175 | 176 | this->n_kate_streams=0; |
| 176 | 177 | this->kate_streams=NULL; |
| 178 | this->ignore_non_utf8 = 0; |
| 177 | 179 | |
| 178 | 180 | this->pix_fmt = PIX_FMT_YUV420P; |
| ... | ...@@ -1183,4 +1185,5 @@ |
| 1183 | 1185 | " --subtitles-language language set subtitles language (de, en_GB, etc)\n" |
| 1184 | 1186 | " --subtitles-category category set subtitles category (default \"subtitles\")\n" |
| 1187 | " --subtitles-ignore-non-utf8 ignores any non utf-8 sequence in utf-8 text\n" |
| 1185 | 1188 | "\n" |
| 1186 | 1189 | #endif |
| ... | ...@@ -1279,4 +1282,5 @@ |
| 1279 | 1282 | {"subtitles",required_argument,&flag,SUBTITLES_FLAG}, |
| 1280 | 1283 | {"subtitles-encoding",required_argument,&flag,SUBTITLES_ENCODING_FLAG}, |
| 1284 | {"subtitles-ignore-non-utf8",0,&flag,SUBTITLES_IGNORE_NON_UTF8_FLAG}, |
| 1281 | 1285 | {"subtitles-language",required_argument,&flag,SUBTITLES_LANGUAGE_FLAG}, |
| 1282 | 1286 | {"subtitles-category",required_argument,&flag,SUBTITLES_CATEGORY_FLAG}, |
| ... | ...@@ -1408,4 +1412,8 @@ |
| 1408 | 1412 | flag = -1; |
| 1409 | 1413 | break; |
| 1414 | case SUBTITLES_IGNORE_NON_UTF8_FLAG: |
| 1415 | convert->ignore_non_utf8 = 1; |
| 1416 | flag = -1; |
| 1417 | break; |
| 1410 | 1418 | case SUBTITLES_LANGUAGE_FLAG: |
| 1411 | 1419 | if (strlen(optarg)>15) { |
| ... | ...@@ -1425,4 +1433,5 @@ |
| 1425 | 1433 | case SUBTITLES_FLAG: |
| 1426 | 1434 | case SUBTITLES_ENCODING_FLAG: |
| 1435 | case SUBTITLES_IGNORE_NON_UTF8_FLAG: |
| 1427 | 1436 | case SUBTITLES_LANGUAGE_FLAG: |
| 1428 | 1437 | case SUBTITLES_CATEGORY_FLAG: |
| ... | ...@@ -1684,5 +1693,5 @@ |
| 1684 | 1693 | for (n=0; n<convert->n_kate_streams; ++n) { |
| 1685 | 1694 | ff2theora_kate_stream *ks=convert->kate_streams+n; |
| 1686 | | if (load_subtitles(ks)>=0) { |
| 1695 | if (load_subtitles(ks,convert->ignore_non_utf8)>=0) { |
| 1687 | 1696 | printf("Muxing Kate stream %d from %s as %s %s\n", |
| 1688 | 1697 | n,ks->filename, |
To list