summaryrefslogtreecommitdiff
blob: 04e75449dbae24a26fffbfa1d3a357faf7f3f0a2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
# bundled_lib.py; implementation of a fetcher class using httplib and ftplib.
# Copyright 2004 Gentoo Foundation
# Distributed under the terms of the GNU General Public License v2
#$Header: /local/data/ulm/cvs/history/var/cvsroot/gentoo-src/portage/portage/transports/bundled_lib.py,v 1.2 2005/07/13 05:51:35 ferringb Exp $

import httplib, ftplib, urlparse, base64, re, sys, os

class BundledConnection:
	"""a fetcher abstraction using httplib and ftplib.
	offers api access to fetch a uri from scratch, or to resume a partial download

	fetch() and resume() follow a shell like convention: they return False when
	the transfer succeeded, and True when it failed."""

	# protocols this class knows how to handle, and their standard ports
	default_ports = {'http':80, 'https':443, 'ftp':21}

	def __init__(self, persistant=False,chunk_size=8192,verbose=True):
		"""persistant: cache connections, reusing them per host/credentials/port
		chunk_size: base transfer unit in bytes; http(s) requests ask for windows
		of 8 chunks, ftp connections are created with this chunk size
		verbose: default for whether fetch/resume print what they're doing"""
		self.__persistant = persistant
		self.__chunk_size = chunk_size
		self.__verbose = verbose

		# lifting the connection caching from check_src_uri's code- combine them if possible
		# one cache per protocol, looked up by name in __get_connection
		self._http_conn = {}
		self._https_conn = {}
		self._ftp_conn = {}

	def __get_connection(self, uri):
		"""internal function to raid from the instances cache of connections, or return a new one
		returns (proto, connection, path portion of the uri)"""
		proto, host, con_dict, url, key = self.__process_uri(uri)
		cache = getattr(self, "_%s_conn" % proto)
		if self.__persistant and key in cache:
			return proto, cache[key], url

		if proto in ('http','https'):
			con = httpConnection(host, **con_dict)
		else:
			con = ftpConnection(host, chunk_size=self.__chunk_size, **con_dict)
		if self.__persistant:
			cache[key] = con
		return proto, con, url

	def __default_file_name(self, uri, url):
		"""internal function deriving a local file name from the last path component of url
		raises Exception if the path doesn't end in a usable name"""
		name = url[url.rfind("/") + 1:]
		if not name:
			raise Exception("Unable to deterimine file_name from %s" % uri)
		return name

	def __transfer(self, proto, con, url, myf, pos):
		"""internal function running the actual transfer of url into myf, starting at byte pos
		returns True if the transfer succeeded, False otherwise"""
		if proto in ('http','https'):
			return self.__execute_transfer(con, url, myf, pos)
		# the ftp connection signals failure with False; anything else is success
		return con.request(url, callback=myf.write, start=pos) is not False

	def fetch(self,uri, file_name=None,verbose=None):
		"""fetch uri, storing it in file_name
		if file_name isn't given, the last path component of the uri is used.
		verbose overrides the instance default when not None.
		returns False on success, True on failure"""
		if verbose is None:
			verbose = self.__verbose
		proto, con, url = self.__get_connection(uri)
		if not file_name:
			file_name = self.__default_file_name(uri, url)
		try:
			# binary mode- the payload must land on disk untouched
			myf = open(file_name, "wb", 0)
		except (IOError, OSError):
			e = sys.exc_info()[1]
			sys.stderr.write("Failed opening file %s for saving:\n%s\n" % (file_name,str(e)))
			return True
		if verbose:
			print("fetching '%s' -> '%s'" % (uri, file_name))

		denied = False
		ret = False
		try:
			try:
				ret = self.__transfer(proto, con, url, myf, 0)
			except UnableToAccess:
				if verbose:
					print(sys.exc_info()[1])
				denied = True
		finally:
			# always release the file, even if something unexpected propagates
			myf.close()
		if denied:
			# nothing useful was stored, don't leave an empty file behind
			os.remove(file_name)
			return True
		return not ret

	def resume(self, uri, file_name=None,verbose=None):
		"""resume uri into file_name
		if file_name exists, the transfer restarts one chunk_size before its end
		(or at 0 if the file is no bigger than a chunk); if it doesn't exist this
		behaves like a fresh fetch.
		returns False on success, True on failure"""
		if verbose is None:
			verbose = self.__verbose
		proto, con, url = self.__get_connection(uri)
		if not file_name:
			file_name = self.__default_file_name(uri, url)
		try:
			try:
				pos = os.stat(file_name).st_size
			except OSError:
				if sys.exc_info()[1].errno != 2:
					raise
				# errno 2: file not found, so there is nothing to resume from
				pos = 0
				myf = open(file_name, "wb", 0)
			else:
				# open it manually, since open(file_name,"a") has the lovely feature of
				# ignoring _all_ previous seek requests the minute a write is issued.
				fd = os.open(file_name, os.O_WRONLY | getattr(os, "O_BINARY", 0))
				myf = os.fdopen(fd, "wb", 0)
				if pos > self.__chunk_size:
					# back up a chunk, so the tail of the existing file gets rewritten
					pos -= self.__chunk_size
					myf.seek(pos, 0)
				else:
					pos = 0
		except (IOError, OSError):
			e = sys.exc_info()[1]
			sys.stderr.write("Failed opening file %s for saving:\n%s\n" % (file_name,str(e)))
			return True

		if verbose:
			print("resuming '%s' -> '%s'" % (uri, file_name))

		ret = False
		try:
			try:
				ret = self.__transfer(proto, con, url, myf, pos)
			except UnableToAccess:
				if verbose:
					print(sys.exc_info()[1])
				# unlike fetch, the partial file is kept
				return True
		finally:
			myf.close()
		return not ret

	def __execute_transfer(self, con, url, fd, pos):
		"""internal http(|s) function for looping over requests storing to fd
		con.request is expected to return (data, more), more being true while
		further windows should be requested.
		returns True once the transfer completed, False if it had to bail"""
		window = self.__chunk_size * 8
		redirects_left = 10
		more = True
		while more:
			try:
				data, more = con.request(url, start=pos, end=(pos + window - 1))
			except MovedLocation:
				ml = sys.exc_info()[1]
				sys.stderr.write(str(ml)+"\n")
				url = ml.location
				redirects_left -= 1
				if redirects_left <= 0:
					print("we've been redirected too many times- bailing")
					return False
				continue
			except UnhandledError:
				print(sys.exc_info()[1])
				return False
			if data is None:
				# the request itself failed, there is nothing to store
				return False
			fd.write(data)
			pos += len(data)
		return True

	def __process_uri(self, uri):
		"""internal function to determine the proto, host, uri, options for
		__get_connection, and a hash representing this host's specific options.
		username, password, port, ssl, etc
		returns [proto, host, option dict, path, hash]
		raises Exception for protocols not listed in default_ports"""
		con_dict = {}
		proto, netloc, url = urlparse.urlparse(uri)[0:3]
		if proto not in self.default_ports:
			raise Exception("unknown protocol %s for uri %s" % (proto,uri))
		if proto == 'https':
			# httpConnection only speaks ssl when told to
			con_dict["ssl"] = True

		# everything before the last @ is user[:passwd]
		at = netloc.rfind("@")
		if at != -1:
			userinfo = netloc[:at]
			netloc = netloc[at + 1:]
			colon = userinfo.find(":")
			if colon == -1:
				con_dict["user"] = userinfo
			else:
				con_dict["user"] = userinfo[:colon]
				con_dict["passwd"] = userinfo[colon + 1:]

		port_match = re.search(r":(\d+)$", netloc)
		if port_match:
			# found a port
			con_dict["port"] = int(port_match.group(1))
			netloc = netloc[:port_match.start()]

		# the cache key: the sorted options, followed by the host
		names = list(con_dict.keys())
		names.sort()
		lines = ['%s:%s' % (x, str(con_dict[x])) for x in names]
		lines.append("host:%s" % netloc)
		return [proto, netloc, con_dict, url, "\n".join(lines)]

class ftpConnection:
	"""higher level interface over ftplib"""
	def __init__(self,host,ssl=False,port=21,user=None,passwd=None,passive=True, chunk_size=8192,debug=0):
		"""connects to host:port and logs in right away
		ssl is currently not supported, asking for it raises an Exception
		user/passwd are only used if both are given, otherwise ftplib's default login is used
		debug flips on a bit of extra info
		chunk_size controls the size of chunks transferred per callback- useful for performance tweaking

		note a ftpConnection instance *must not* be called in a threaded/parallel manner-
		the connection can handle one, and only one request, and is currently not smart
		enough to lock itself for protection"""
		if ssl:
			raise Exception("sftp isn't support atm")
		self.__chunk = chunk_size
		# per request state, reset at the start of every request()
		self.__pos = 0
		self.__end = 0
		self.__endlimit = False
		self.__callback = None
		self.__data = ''
		self.__aborted = False

		self.__con = ftplib.FTP()
		self.__con.set_debuglevel(debug)
		self.__con.connect(host,port)
		if user and passwd:
			self.__con.login(user,passwd)
		else:
			self.__con.login()
		self.__con.set_pasv(passive)

	def request(self, file, start=0, end=0, callback=None):
		"""make a request of file, with optional start/end
		start is the offset to begin at, end (if non zero) the offset to stop before.

		callback is used for processing data- if callback, then callback is called
		with each piece of data, and the return is True/False depending on success

		if no callback, then the requested window is returned, eg string, or False
		if the transfer failed.

		technical note: do to the way the protocol works, the remote host may send more then what
		was specified via start/end.  Despite this, only that window is processed/returned.
		Just worth noting since it means if you're requested 3 bytes from a file, depending on how
		quickly the server disconnects, 13k may've been sent by that point (still, only the 3 bytes
		are actually processed by callback/returned)."""

		self.__pos = start
		self.__end = end
		self.__callback = callback
		self.__data = ''
		self.__aborted = False
		# recomputed each time, so a windowed request doesn't truncate later unbounded ones
		self.__endlimit = bool(end)

		# never read in bigger pieces than the window that was asked for
		block = self.__chunk
		if end and end - start < block:
			block = end - start

		try:
			self.__con.retrbinary("retr %s" % file, self.__transfer_callback, block, start)
		except ftplib.all_errors:
			self.__data = ''
			return False

		if callback is not None:
			return True
		data = self.__data
		self.__data = ''
		return data

	def __transfer_callback(self, data):
		"""internal callback function used with ftplib.  This either appends the returned data,
		or passes it off to the requested callback function.
		once the end of the requested window is reached the transfer is aborted, and any
		data still trickling in is dropped"""
		if self.__aborted:
			return

		l = len(data)

		if self.__endlimit and self.__pos + l >= self.__end:
			# keep only what is still missing from the window
			l = max(self.__end - self.__pos, 0)
			data = data[:l]
			self.__aborted = True
			self.__con.abort()

		self.__pos += l
		if self.__callback:
			self.__callback(data)
		else:
			self.__data += data

class httpConnection:
	"""higher level abstraction over httplib allowing window level access to a remote uri"""
	def __init__(self,host,ssl=False,port=None,user=None,passwd=None):
		"""options for this host connection, sent to the server via the headers
		note you cannot just specify the port as 443 and assume it'll do ssl-
		you must flip on ssl to enable descryption/encryption of the protocol

		port defaults to httplib's standard port for the chosen protocol.
		user/passwd, if both given, are sent as http basic authorization.

		just like with ftpConnection, instances *must not* be called in a parallel/threaded
		manner without external locking.
		This class isn't smart enough to protect itself, although it will be made so at
		some point."""

		self.__headers = {}
		self.__host = host
		if user and passwd:
			# encodestring appends (and for long input inserts) newlines, strip them
			credentials = base64.encodestring("%s:%s" % (user,passwd)).replace("\012","")
			self.__headers["Authorization"] = "Basic %s" % credentials

		if port is None:
			if ssl:
				port = httplib.HTTPS_PORT
			else:
				port = httplib.HTTP_PORT
		self.__port = port

		if ssl:
			self.__conn = httplib.HTTPSConnection(self.__host, self.__port)
		else:
			self.__conn = httplib.HTTPConnection(self.__host, self.__port)

	def request(self,uri,start=None,end=None):
		"""GET uri, optionally limited to the byte window start-end (both inclusive)
		if only start is given, the window ends 3000 bytes after it.

		returns (data, more): more is true if the server honored the range and
		filled the whole window, ie there is likely more to request.
		returns (None, False) if httplib failed sending the request.

		raises MovedLocation for 301/302, UnableToAccess for 403/404, and
		UnhandledError for any other status besides 200/206"""
		if end is None and start:
			end = start + 3000

		# work on a copy, so the Range of this request can't leak into the next one
		headers = self.__headers.copy()
		size = None
		if start is not None and end is not None:
			size = end - start
			headers["Range"] = "bytes=%i-%i" % (start, end)

		try:
			self.__conn.request("GET", uri, None, headers)
		except httplib.HTTPException:
			print("caught exception %s" % str(sys.exc_info()[1]))
			return None,False

		response = self.__conn.getresponse()
		rc = response.status
		if rc in (301,302):
			# drain the body so the connection stays usable
			response.read()
			# getheader ignores the case of the header name
			location = response.getheader("location")
			if location:
				raise MovedLocation(rc, location.strip())

			# if we hit this point, the server is stupid.
			raise Exception("received %i, but no location" % rc)

		if rc in (404,403):
			response.read()
			raise UnableToAccess(rc,response.reason)
		elif rc not in (200,206):
			response.read()
			raise UnhandledError(rc,response.reason)

		data = response.read()
		if size is not None:
			# the range is inclusive, so a full window holds size + 1 bytes
			return data,(rc==206 and len(data) -1 == size)
		return data,(rc==206)


class UnhandledError(Exception):
	"""raised when a status code shows up that nothing here knows how to deal with,
	handing control back to the caller"""

	def __init__(self, status, reason):
		"""status is the numeric status code, reason is stored in its string form"""
		self.status, self.reason = status, str(reason)

	def __str__(self):
		template = "Unhandled Status code: %i: %s"
		return template % (self.status, self.reason)

class UnableToAccess(Exception):
	"""the remote resource can't be had- used for 404's and 403's"""

	def __init__(self,status,reason):
		"""status is the status code, reason is stored in its string form"""
		self.status, self.reason = status, str(reason)

	def __str__(self):
		fields = (self.status, self.reason)
		return "%s code: %s" % fields

class MovedLocation(Exception):
	"""signals an http redirect (301/302), carrying the new location along"""

	def __init__(self,status,new_loc):
		"""status is the redirect's status code, new_loc is stored as a string in .location"""
		self.location = str(new_loc)
		self.status = status

	def __str__(self):
		# anything that isn't a 301 is reported as a temporary (302) move
		if self.status != 301:
			return "302: Location has temporarily moved: %s" % self.location
		return "301: Location has moved: %s" % self.location