aboutsummaryrefslogtreecommitdiff
path: root/pym
diff options
context:
space:
mode:
author: volpino <fox91@anche.no> 2012-07-27 11:26:54 +0200
committer: volpino <fox91@anche.no> 2012-07-27 11:26:54 +0200
commit: 326658acb918a083fff3a8e25c24a3d5eff1f7c0 (patch)
tree: 16bd166ef1c337c9cc24c4131e00cdd038ad2f0c /pym
parent: euscan: Added remote-id handler (diff)
download: euscan-326658acb918a083fff3a8e25c24a3d5eff1f7c0.tar.gz
euscan-326658acb918a083fff3a8e25c24a3d5eff1f7c0.tar.bz2
euscan-326658acb918a083fff3a8e25c24a3d5eff1f7c0.zip
euscan: confidence tweak in generic handler
Signed-off-by: volpino <fox91@anche.no>
Diffstat (limited to 'pym')
-rw-r--r-- pym/euscan/handlers/url/cpan.py 2
-rw-r--r-- pym/euscan/handlers/url/generic.py 49
-rw-r--r-- pym/euscan/handlers/url/github.py 2
-rw-r--r-- pym/euscan/handlers/url/php.py 2
-rw-r--r-- pym/euscan/handlers/url/pypi.py 2
-rw-r--r-- pym/euscan/handlers/url/rubygems.py 2
6 files changed, 45 insertions, 14 deletions
diff --git a/pym/euscan/handlers/url/cpan.py b/pym/euscan/handlers/url/cpan.py
index 0721324..5513e0d 100644
--- a/pym/euscan/handlers/url/cpan.py
+++ b/pym/euscan/handlers/url/cpan.py
@@ -6,7 +6,7 @@ import json
from euscan import helpers, output
HANDLER_NAME = "cpan"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
PRIORITY = 90
_cpan_package_name_re = re.compile("mirror://cpan/authors/.*/([^/.]*).*")
diff --git a/pym/euscan/handlers/url/generic.py b/pym/euscan/handlers/url/generic.py
index 1f43a7a..3ba7ac0 100644
--- a/pym/euscan/handlers/url/generic.py
+++ b/pym/euscan/handlers/url/generic.py
@@ -1,7 +1,8 @@
-from urlparse import urljoin
+from urlparse import urljoin, urlparse
import urllib2
import re
import StringIO
+import difflib
try:
from BeautifulSoup import BeautifulSoup
@@ -14,11 +15,40 @@ from euscan import CONFIG, SCANDIR_BLACKLIST_URLS, \
BRUTEFORCE_BLACKLIST_PACKAGES, BRUTEFORCE_BLACKLIST_URLS, output, helpers
HANDLER_NAME = "generic"
-CONFIDENCE = 50.0
+CONFIDENCE = 45
PRIORITY = 0
BRUTEFORCE_HANDLER_NAME = "brute_force"
-BRUTEFORCE_CONFIDENCE = 30.0
+BRUTEFORCE_CONFIDENCE = 30
+
+
def confidence_score(found, original, minimum=CONFIDENCE):
    """Rate how closely the URL ``found`` resembles the URL ``original``.

    The score starts at ``minimum`` and grows with the similarity of the
    two URL paths, up to ``minimum * 2``.

    :param found: URL discovered by the scanner.
    :param original: URL the scan started from.
    :param minimum: score returned when the URLs are on different hosts
        or have a different directory depth; also the base of the score.
    :return: ``minimum`` for a host or depth mismatch, otherwise an int
        between ``minimum`` and ``minimum * 2`` inclusive.
    """
    found_p = urlparse(found)
    original_p = urlparse(original)

    # A different host gets the baseline score only.
    if found_p.netloc != original_p.netloc:
        return minimum

    # Same number of "/" separators means the same directory depth.
    if found_p.path.count("/") != original_p.path.count("/"):
        return minimum

    # Drop version-like characters so that only the non-numeric skeleton
    # of each path is compared.  Inside the character class "+" is a
    # literal plus sign, so digits, "+" and "." are all removed.
    found_path = re.sub(r"[\d+.]", "", found_p.path)
    original_path = re.sub(r"[\d+.]", "", original_p.path)

    # Length of the leading part shared by both paths.  zip() stops at the
    # shorter string, so a path that is a strict prefix of the other one
    # cannot cause an out-of-range index.
    common = 0
    for found_char, original_char in zip(found_path, original_path):
        if found_char != original_char:
            break
        common += 1
    found_path = found_path[common:]
    original_path = original_path[common:]

    # Similarity of the remaining tails, a float in [0, 1]; two empty
    # tails count as a perfect match (ratio 1.0).
    diff = difflib.SequenceMatcher(None, found_path, original_path).ratio()
    return int(minimum + minimum * diff)  # maximum score is minimum * 2
def scan_html(data, url, pattern):
@@ -98,7 +128,8 @@ def scan_directory_recursive(cp, ver, rev, url, steps, orig_url):
path = urljoin(url, path)
if not steps and path not in orig_url:
- versions.append((path, pv, HANDLER_NAME, CONFIDENCE))
+ confidence = confidence_score(path, orig_url)
+ versions.append((path, pv, HANDLER_NAME, confidence))
if steps:
ret = scan_directory_recursive(cp, ver, rev, path, steps, orig_url)
@@ -209,14 +240,14 @@ def brute_force(pkg, url):
if helpers.version_filtered(cp, ver, version):
continue
- url = helpers.url_from_template(template, version)
- infos = helpers.tryurl(url, template)
+ try_url = helpers.url_from_template(template, version)
+ infos = helpers.tryurl(try_url, template)
if not infos:
continue
-
- result.append([url, version, BRUTEFORCE_HANDLER_NAME,
- BRUTEFORCE_CONFIDENCE])
+ confidence = confidence_score(try_url, url,
+ minimum=BRUTEFORCE_CONFIDENCE)
+ result.append([try_url, version, BRUTEFORCE_HANDLER_NAME, confidence])
if len(result) > CONFIG['brute-force-false-watermark']:
output.einfo(
diff --git a/pym/euscan/handlers/url/github.py b/pym/euscan/handlers/url/github.py
index e4ebe10..dc5dd16 100644
--- a/pym/euscan/handlers/url/github.py
+++ b/pym/euscan/handlers/url/github.py
@@ -7,7 +7,7 @@ import portage
from euscan import helpers, output
HANDLER_NAME = "github"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
PRIORITY = 90
diff --git a/pym/euscan/handlers/url/php.py b/pym/euscan/handlers/url/php.py
index 853059a..d0fef71 100644
--- a/pym/euscan/handlers/url/php.py
+++ b/pym/euscan/handlers/url/php.py
@@ -6,7 +6,7 @@ import xml.dom.minidom
from euscan import helpers, output
HANDLER_NAME = "php"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
PRIORITY = 90
diff --git a/pym/euscan/handlers/url/pypi.py b/pym/euscan/handlers/url/pypi.py
index 82251e6..02428ee 100644
--- a/pym/euscan/handlers/url/pypi.py
+++ b/pym/euscan/handlers/url/pypi.py
@@ -6,7 +6,7 @@ import portage
from euscan import helpers, output
HANDLER_NAME = "pypi"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
PRIORITY = 90
diff --git a/pym/euscan/handlers/url/rubygems.py b/pym/euscan/handlers/url/rubygems.py
index a3021f0..3b4facd 100644
--- a/pym/euscan/handlers/url/rubygems.py
+++ b/pym/euscan/handlers/url/rubygems.py
@@ -6,7 +6,7 @@ import urllib2
from euscan import helpers, output
HANDLER_NAME = "rubygems"
-CONFIDENCE = 100.0
+CONFIDENCE = 100
PRIORITY = 90