Tomas Salfischberger | e1b0427 | 2005-06-08 21:01:26 +0000 | [diff] [blame] | 1 | #!/usr/bin/perl |
| 2 | |
| 3 | # __________ __ ___. |
| 4 | # Open \______ \ ____ ____ | | _\_ |__ _______ ___ |
| 5 | # Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / |
| 6 | # Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < |
| 7 | # Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ |
| 8 | # \/ \/ \/ \/ \/ |
| 9 | # $Id$ |
| 10 | # |
| 11 | # Copyright (C) 2005 Tony Motakis |
| 12 | # |
| 13 | # All files in this archive are subject to the GNU General Public License. |
| 14 | # See the file COPYING in the source tree root for full license agreement. |
| 15 | # |
| 16 | # This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY |
| 17 | # KIND, either express or implied. |
| 18 | |
# Upper bound, in characters, applied to headwords when the index is read.
our $word_limit = 32;
| 21 | |
| 22 | use Compress::Zlib; |
| 23 | |
# Build the base 64 conversion tables: @b64_values lists the 64 digits in
# order of increasing value, and %b64_get_value maps each digit character
# back to its numeric value (0..63).
@b64_values = ( 'A' .. 'Z', 'a' .. 'z', '0' .. '9', '+', '/' );

@b64_get_value{@b64_values} = ( 0 .. $#b64_values );
| 34 | |
# Convert a number written in base 64 notation (most significant digit
# first, digit values taken from %b64_get_value) to its numeric value.
#
#   $_[0]   string of base 64 digits; whitespace characters are ignored
#
# Returns the decoded number, or 0 for an empty or undefined argument.
# Characters that are missing from %b64_get_value count as a zero digit.
sub base64 {
    my ($digits) = @_;
    my $num = 0;

    return $num unless defined $digits;

    # Walk the string character by character instead of testing the
    # remaining string for truth: "0" is false in perl, so a truth test
    # stops early whenever the part still to be decoded is a lone "0"
    # (digit value 52), e.g. "0A" would come out as 0 instead of 3328.
    foreach my $digit ( split //, $digits ) {
        next if $digit =~ /\s/;
        $num = $num * 64 + ( $b64_get_value{$digit} || 0 );
    }

    return $num;
}
| 49 | |
# Open the input and output files named on the command line:
#   $ARGV[0]  dictionary index, read through the INDEX filehandle
#   $ARGV[1]  compressed definitions file, read through the zlib handle $DICT
#             (an object rather than a filehandle, because Compress::Zlib
#             does the decompression)
#   $ARGV[2]  output file in plain rockbox dictionary format, written through
#             the RDFOUT filehandle
@ARGV >= 3
    or die "Usage: $0 <index file> <definitions file> <output file>\n";

# Three-argument open keeps special characters in a file name (such as a
# leading ">" or a trailing "|") from being interpreted as an open mode.
open INDEX, '<', $ARGV[0] or die "Could not open index: $!";

# $gzerrno carries the zlib message for zlib failures and the system
# message otherwise, so it is the right variable to report for gzopen.
$DICT = gzopen( $ARGV[1], "rb" )
    or die "Could not open definitions file: $gzerrno";

open RDFOUT, '>', $ARGV[2] or die "Could not open output file: $!";
| 57 | |
# Read the index. Each line holds a headword, the offset of its definition
# inside the definitions data and the length of that definition, separated
# by tabs; offset and length are written in base 64.
while ( my $line = <INDEX> ) {
    # skip lines that start with "00database" or "00-database"
    next if $line =~ /^00-?database/;

    my ( $word, $begin, $length ) = split /\t|\n/, $line;
    next unless defined $length;    # blank line or incomplete entry

    # Drop one leading whitespace character if there is one, cut the
    # headword down to $word_limit characters and lowercase it. The
    # leading whitespace must be optional: headwords normally start in the
    # first column, and requiring it would leave them neither truncated
    # nor lowercased.
    $word =~ s/^\s(?=.)//;
    $word = lc substr $word, 0, $word_limit;

    # A headword can occur more than once (more so after truncation and
    # lowercasing). List it only once so that it is written out only once;
    # the offset and length of its last occurrence are the ones kept.
    push @def_list, $word unless exists $def_begin{$word};
    $def_begin{$word}  = base64($begin);
    $def_length{$word} = base64($length);
}

# Sort the definition list. Input from the index is usually sorted, but this
# is not mandatory in the dict file format, so we can't rely on it.
@def_list = sort @def_list;
| 73 | |
# Read the whole DICT file into memory. Overkill? Probably. But the file is
# compressed, and we need quick access to random parts of it.
while (1) {
    # gzread returns the number of bytes read, 0 at end of file and -1 on
    # error. -1 is a true value, so the result has to be compared
    # explicitly; a plain truth test would loop forever on a read error.
    my $chunk;
    my $bytes_read = $DICT->gzread($chunk);
    die "Could not read definitions file: " . $DICT->gzerror() . "\n"
        if $bytes_read < 0;
    last if $bytes_read == 0;
    $def_all .= $chunk;
}
$DICT->gzclose();
| 77 | |
# Write one line per headword to the output file: the headword, a tab, and
# its definition with every newline (plus the whitespace following it)
# collapsed into a single space.
foreach my $word (@def_list) {
    my $def = substr $def_all, $def_begin{$word}, $def_length{$word};
    $def = '' unless defined $def;    # offset lies outside the data read
    $def =~ s/\n\s*/ /g;
    print RDFOUT $word . "\t" . $def . "\n"
        or die "Could not write output file: $!";
}

# Buffered write errors may only surface when the handle is closed.
close RDFOUT or die "Could not write output file: $!";
| 83 | |