#!/usr/bin/python

# Copyright 2007 Yannick Gingras <ygingras@ygingras.net>

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301 USA

from urllib import urlopen
from re import compile
from random import sample

# Some German texts from
#   http://www.gutenberg.org/browse/languages/de

# Project Gutenberg etext numbers.  Each number is substituted into
# MAIN_URL to build the address of that book's page and is also used to
# name the local output file.
# NOTE: several numbers appear more than once in this list (for example
# 16264, 8565 and 607).
BOOKS = [ 15734, 14225, 13953, 14105, 6110, 19163, 19716, 2189, 16880,
  6724, 6641, 2190, 19673, 2889, 8803, 12267, 13690, 19596, 18232,
  18731, 16264, 9178, 19380, 19760, 12921, 10507, 18436, 18733, 21021,
  14075, 19653, 16302, 16301, 4504, 4505, 4503, 4501, 2380, 4502,
  18463, 16264, 8565, 8565, 5072, 5322, 20757, 2322, 17161, 14340,
  21680, 18101, 19778, 14223, 8085, 19460, 13732, 17454, 14997, 15213,
  19760, 12921, 18231, 20589, 11108, 12268, 11075, 18255, 8392, 6736,
  15711, 15995, 19530, 11306, 11306, 5323, 20211, 20613, 20944, 19971,
  14221, 9335, 9327, 18471, 14028, 19675, 18475, 20412, 14962, 15756,
  10428, 17657, 2402, 2146, 18101, 2229, 2230, 21000, 2321, 2406,
  2312, 2054, 2404, 2405, 17664, 10354, 2407, 2408, 5653, 10426, 2320,
  9260, 5325, 7875, 2228, 10353, 10425, 2420, 5326, 2403, 2319, 2335,
  2336, 2337, 2338, 2339, 2340, 2341, 2342, 2409, 2410, 2411, 7985,
  9181, 7943, 8961, 8964, 7944, 7946, 9045, 9046, 8999, 9049, 7945,
  8568, 9062, 7996, 9058, 9044, 7049, 16880, 9977, 16278, 16279,
  13452, 6726, 13451, 6638, 6639, 6640, 6892, 6890, 6891, 6725, 20302,
  4079, 4080, 2887, 4083, 3297, 7810, 20786, 6698, 6728, 6729, 6834,
  21349, 12927, 12273, 13805, 12113, 3498, 6079, 5607, 17622, 2499,
  13921, 9059, 7858, 9065, 9083, 9064, 9066, 9085, 9084, 9099, 9086,
  10914, 4013, 17362, 9200, 6341, 15787, 4601, 3299, 3298, 3213, 3212,
  3223, 3222, 3215, 3214, 3221, 3220, 3209, 3208, 3211, 3210, 3217,
  3216, 3219, 3218, 3225, 3224, 3299, 3298, 3213, 3212, 3223, 3222,
  3215, 3214, 3221, 3220, 3209, 3208, 3211, 3210, 3217, 3216, 3219,
  3218, 3225, 3224, 17169, 21801, 16304, 21593, 6342, 6343, 16718,
  6696, 20313, 6645, 6646, 6648, 6723, 6647, 21658, 20211, 21349,
  6819, 6821, 6830, 6832, 6833, 6835, 9950, 9375, 6820, 9108, 9160,
  9158, 9325, 9326, 10055, 9110, 9369, 6889, 6822, 9187, 9157, 9186,
  6831, 9159, 20965, 8298, 8375, 21658, 14700, 15068, 21115, 19530,
  12053, 12108, 13810, 15028, 2779, 13657, 13658, 13659, 607, 607,
  18620, 6421, 9495, 5801, 9496, 9632, 9494, 3675, 15813, 3060, 3061,
  3062, 3063, 3064, 3065, 15898, 9623, 7503, 15505, 13661, 4371, 4372,
  4373, 4374, 4375, 7205, 7202, 7203, 7206, 7204, 7207, 17379, 17383,
  17379, 17383, 15267, 15267, 6637, 6644, 7859, 6642, 6643, 7861,
  7860, 6654, 20937, 21031, 17153, 2188, 12012, 16280, 14142, 15890,
  17599, 17600, 2174, 17130, 10917, 19733, 12075, 11677, 16264, 21658,
  21349, 6496, 6649, 7939, 6383, 6498, 6503, 6504, 6525, 6505, 6499,
  6518, 6549, 4601, 9875, 7043, 6924, 6996, 7022, 7041, 15070, 20637,
  18436, 18148, 17142, 10823, 11925, 15711, 15995, 21527, 6990, 7276,
  7225, 9875, 7933, 7934, 7043, 7240, 7292, 7323, 7233, 6975, 7269,
  7185, 6924, 6996, 7232, 7022, 7264, 7236, 7226, 7186, 7041, 9491,
  6004, 9810, 9802, 16264, 7512, 7500, 7511, 20780, 9860, 9859, 9861,
  7888, 8915, 14330, 7068, 8126, 16278, 16279, 8889, 8927, 8895, 8916,
  6651, 20977, 8926, 8919, 8923, 8925, 8922, 8917, 8921, 19778, 12636,
  21053, 6990, 6975, 9491, 9810, 9802, 14915, 21658, 10223, 15891,
  15559, 18258, 19611, 17007, 15736, 17413, 20413, 17143, 15952,
  21535, 19940, 18551, 18552, 12266, 2313, 2314, 7276, 7225, 7933,
  7934, 7240, 7292, 7323, 7233, 7269, 2187, 7185, 7232, 7264, 7236,
  7226, 7186, 12660, 16264, 16264, 19823, 2947, 19778 ]

  

# Root of the site; site-relative download links are appended to it.
BASE_URL = "http://www.gutenberg.org/"
# Address of a book's page; "%d" is filled with an etext number.
MAIN_URL = BASE_URL + "etext/%d"
# Captures the site-relative path of a plain-text download link found in
# a book's page.  The path may not contain a double quote, so a match can
# never run past the end of its own href attribute, and the dot of the
# ".txt" extension is escaped so that it only matches a literal dot.
DL_PAT = compile(r'<td class="pgdbfilesdownload"><a href="(/(?:dirs|files)/[^"]*?\.txt)"')
# Maximum number of bytes (4 MiB) requested from a single read() call.
MAX = 2**22

for book in sample(BOOKS, 50):
    try:
        data = urlopen(MAIN_URL % book).read(MAX)
        print book, 
        dl = DL_PAT.findall(data)[0]
        print dl
        stream = urlopen(BASE_URL + dl)
        open("%04d.txt" % book, "w").write(stream.read(MAX))
    except:
        print "Error"
    
