#!/usr/bin/env python
# vim: set ts=8 sw=4 sts=4 et ai tw=79:
"""Print the plain 4 to 7 character words found in an aspell .rws file.

The file is scanned for a known first word (``first_data``) instead of parsing
its headers; the records that follow are then read one at a time, filtered,
and written to standard output, one word per line.

All words are handled as raw byte strings; nothing is decoded. The script
runs on Python 2.6+ and Python 3.
"""
import sys

# What to parse. The first_data allows us to skip over the headers without
# needing to understand any of them.
#source, first_data = '/var/lib/aspell/nl.rws', '\x0e\x0e\x0aabacadabra\x00'
source, first_data = '/var/lib/aspell/nl.rws', b'abracadabra\x00'

# I attempted to run this on the sv.rws too, but that did not work out well..
# there are some complicating factors involved. Use this for the dutch list
# only.
#source = first_data = ('/var/lib/aspell/sv.rws',
#                       '\x0e\x16\x0cschaktmassor\x00\x0e\x03&AL\x00')

_READ_SIZE = 4096
# Every record starts with three single-byte fields: inf1, inf2 and size.
_HEADER_LEN = 3
# Largest possible record: header + 255 word bytes + NUL terminator.
_MAX_RECORD = _HEADER_LEN + 255 + 1
# A record whose inf1 has this bit set stops the parse (end of the word
# records, as far as observed -- the format is not fully understood).
_END_FLAG = 0b01000000

_UPPERCASE = frozenset(bytearray(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
_ABBREVIATION_FLAGS = (7, 23)                     # all-caps ("OZB")
_HYPHEN_DIAERESIS_FLAGS = (2, 6, 10, 18, 22)      # hyphens and diaereses
_PROPER_NAME_FLAGS = (0, 1, 4, 5, 17, 20, 21)     # place/proper name ("Bibi")


def aspellreader(file, skip_until):
    """Yield ``(word, inf1, inf2)`` for every record from ``skip_until`` on.

    file       -- binary file-like object; only ``read(n)`` is used.
    skip_until -- non-empty byte string marking the first word. The three
                  bytes in front of it are taken as that record's header.

    Each record is read as ``inf1, inf2, size`` (one byte each), ``size``
    bytes of word and a NUL byte. ``word`` is a byte string, ``inf1`` and
    ``inf2`` are ints. Reading stops at end of file, at a truncated record or
    at a record whose ``inf1`` has bit 0x40 set. Nothing is yielded when the
    marker does not occur at file offset 3 or later.

    Raises AssertionError for an empty ``skip_until`` or when a word is not
    followed by a NUL byte. The errors are raised explicitly (not through
    ``assert``) so that ``python -O`` cannot strip them; without the
    terminator check the parser would never advance.
    """
    if not skip_until:
        raise AssertionError('skip_until must not be empty')

    # Skip past the header. Between reads, keep just enough of the tail to
    # recognise a marker that straddles two reads, plus the record header
    # that sits right in front of it.
    overlap = len(skip_until) - 1 + _HEADER_LEN
    buf = b''
    while True:
        more = file.read(_READ_SIZE)
        if not more:
            return  # fail! (marker not found)
        buf += more
        # Start at _HEADER_LEN: a match without three bytes in front of it
        # cannot be a record.
        index = buf.find(skip_until, _HEADER_LEN)
        if index != -1:
            break
        buf = buf[-overlap:]

    # Step back over the header of the first record.
    buf = buf[index - _HEADER_LEN:]

    # Start reading the words one at a time.
    eof = False
    while True:
        # Make sure a complete record of any size is buffered.
        while len(buf) < _MAX_RECORD and not eof:
            more = file.read(_READ_SIZE)
            if more:
                buf += more
            else:
                eof = True

        if len(buf) < _HEADER_LEN + 1:
            break
        inf1, inf2, size = bytearray(buf[:_HEADER_LEN])
        if inf1 & _END_FLAG:
            break  # done?
        end = _HEADER_LEN + size
        if len(buf) < end + 1:
            break  # truncated record at end of file

        word = buf[_HEADER_LEN:end]
        if buf[end:end + 1] != b'\x00':
            raise AssertionError((repr(buf), repr(word), inf1, inf2))
        yield word, inf1, inf2
        buf = buf[end + 1:]


def normalwords(reader):
    """Yield only the plain words from ``reader``.

    reader -- iterable of ``(word, inf1, inf2)`` with byte-string words.

    Skipped are records where ``inf2 - 4`` differs from the word length and
    records whose ``inf1`` is one of the flag values listed at the top of the
    file (abbreviations, hyphenated/diaeresis words, proper names).

    Raises AssertionError when a word that passed those filters still
    contains a hyphen or an ASCII capital: that would mean the flag values
    are not understood correctly.
    """
    for word, inf1, inf2 in reader:
        # Something with a deviating stem...
        if (inf2 - 4) != len(word):
            continue
        if inf1 in _ABBREVIATION_FLAGS:
            continue
        if inf1 in _HYPHEN_DIAERESIS_FLAGS:
            continue
        if inf1 in _PROPER_NAME_FLAGS:
            continue

        if b'-' in word:
            raise AssertionError('unexpected hyphen in %r' % (word,))
        if any(byte in _UPPERCASE for byte in bytearray(word)):
            raise AssertionError('unexpected capital in %r' % (word,))
        yield word, inf1, inf2


def limitedsizewords(reader, min, max):
    """Yield the records whose word length lies in ``min``..``max`` inclusive.

    The parameter names shadow the builtins; they are kept because they are
    part of the existing call signature.
    """
    for word, inf1, inf2 in reader:
        if min <= len(word) <= max:
            yield word, inf1, inf2


def main():
    """Write every plain 4-7 character word in ``source`` to stdout."""
    # Words are raw bytes: write them through the binary layer on Python 3,
    # and straight to sys.stdout on Python 2.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    with open(source, 'rb') as handle:
        dictionary = aspellreader(handle, first_data)
        dictionary = normalwords(dictionary)
        dictionary = limitedsizewords(dictionary, 4, 7)
        for word, _inf1, _inf2 in dictionary:
            out.write(word + b'\n')


if __name__ == '__main__':
    main()