kde-base/nepomuk/files/nepomuk-4.8.1-performance.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

commit 754275eda610dce1160286a76339353097d8764c
Author: Sebastian Trueg <trueg@kde.org>
Date:   Fri Mar 9 17:17:48 2012 +0100

    Backport from nepomuk-core: improved performance on res identification.
    
    BUG: 289932
    FIXED-IN: 4.8.2

diff --git a/nepomuk/services/backupsync/lib/resourceidentifier.cpp b/nepomuk/services/backupsync/lib/resourceidentifier.cpp
index c1a9919..894372c 100644
--- a/nepomuk/services/backupsync/lib/resourceidentifier.cpp
+++ b/nepomuk/services/backupsync/lib/resourceidentifier.cpp
@@ -31,6 +31,7 @@
 #include <Soprano/Statement>
 #include <Soprano/Graph>
 #include <Soprano/Node>
+#include <Soprano/BindingSet>
 #include <Soprano/StatementIterator>
 #include <Soprano/QueryResultIterator>
 #include <Soprano/Model>
@@ -176,19 +177,18 @@ bool Nepomuk::Sync::ResourceIdentifier::runIdentification(const KUrl& uri)
         return false;
     }
 
-    QString query;
-
     QStringList identifyingProperties;
     QHash<KUrl, Soprano::Node> identifyingPropertiesHash;
 
     QHash< KUrl, Soprano::Node >::const_iterator it = res.constBegin();
     QHash< KUrl, Soprano::Node >::const_iterator constEnd = res.constEnd();
+    QList<Soprano::Node> requiredTypes;
     for( ; it != constEnd; it++ ) {
         const QUrl & prop = it.key();
 
         // Special handling for rdf:type
         if( prop == RDF::type() ) {
-            query += QString::fromLatin1(" ?r a %1 . ").arg( it.value().toN3() );
+            requiredTypes << it.value().uri();
             continue;
         }
 
@@ -219,6 +219,10 @@ bool Nepomuk::Sync::ResourceIdentifier::runIdentification(const KUrl& uri)
         return false;
     }
 
+
+    // construct the identification query
+    QString query = QLatin1String("select distinct ?r where { ");
+
     //
     // Optimization:
     // If there is only one identifying property using all that optional and filter stuff
@@ -235,7 +239,7 @@ bool Nepomuk::Sync::ResourceIdentifier::runIdentification(const KUrl& uri)
                            QString::number( numIdentifyingProperties++ ) );
         }
 
-        // Make sure atleast one of the identification properties has been matched
+        // Make sure at least one of the identification properties has been matched
         // by adding filter( bound(?o1) || bound(?o2) ... )
         query += QString::fromLatin1("filter( ");
         for( int i=0; i<numIdentifyingProperties-1; i++ ) {
@@ -247,43 +251,68 @@ bool Nepomuk::Sync::ResourceIdentifier::runIdentification(const KUrl& uri)
         query += QString::fromLatin1("?r %1 %2 . ").arg(Soprano::Node::resourceToN3(identifyingPropertiesHash.constBegin().key()),
                                                          identifyingPropertiesHash.constBegin().value().toN3());
     }
-    query += QLatin1String("}");
 
-    // Construct the entire query
-    QString queryBegin = QString::fromLatin1("select distinct ?r count(?p) as ?cnt "
-    "where { ?r ?p ?o. filter( ?p in (%1) ).")
-    .arg( identifyingProperties.join(",") );
-
-    query = queryBegin + query + QString::fromLatin1(" order by desc(?cnt)");
+    //
+    // For performance reasons we add a limit even though this could mean that we
+    // miss a resource to identify since we check the types below.
+    //
+    query += QLatin1String("} LIMIT 100");
 
-    kDebug() << query;
 
     //
-    // Only store the results which have the maximum score
+    // Fetch a score for each result.
+    // We do this in a separate query for performance reasons.
     //
-    QSet<KUrl> results;
-    int score = -1;
+    QMultiHash<int, KUrl> resultsScoreHash;
+    int maxScore = -1;
     Soprano::QueryResultIterator qit = d->m_model->executeQuery( query, Soprano::Query::QueryLanguageSparql );
     while( qit.next() ) {
-        //kDebug() << "RESULT: " << qit["r"] << " " << qit["cnt"];
+        const Soprano::Node r(qit["r"]);
+
+        //
+        // Check the type requirements. Experiments have shown this to mean a substantial
+        // performance boost as compared to doing it in the main query.
+        //
+        if(!requiredTypes.isEmpty() ) {
+            query = QLatin1String("ask where { ");
+            foreach(const Soprano::Node& type, requiredTypes) {
+                query += QString::fromLatin1("%1 a %2 . ").arg(r.toN3(), type.toN3());
+            }
+            query += QLatin1String("}");
+            if(!d->m_model->executeQuery(query, Soprano::Query::QueryLanguageSparql).boolValue()) {
+                continue;
+            }
+        }
+
+
+        const int score = d->m_model->executeQuery(QString::fromLatin1("select count(?p) as ?cnt where { "
+                                                                       "%1 ?p ?o. filter( ?p in (%2) ) . }")
+                                                   .arg( r.toN3(),
+                                                         identifyingProperties.join(",") ),
+                                                   Soprano::Query::QueryLanguageSparql)
+                          .allBindings().first()["cnt"].literal().toInt();
 
-        int count = qit["cnt"].literal().toInt();
-        if( score == -1 ) {
-            score = count;
+        if( maxScore < score ) {
+            maxScore = score;
         }
-        else if( count < score )
-            break;
 
-        results << qit["r"].uri();
+        resultsScoreHash.insert(score, r.uri());
     }
 
+    //
+    // Only get the results which have the maximum score
+    //
+    QSet<KUrl> results = QSet<KUrl>::fromList(resultsScoreHash.values(maxScore));
+
+
     //kDebug() << "Got " << results.size() << " results";
     if( results.empty() )
         return false;
 
     KUrl newUri;
-    if( results.size() == 1 )
+    if( results.size() == 1 ) {
         newUri = *results.begin();
+    }
     else {
         kDebug() << "DUPLICATE RESULTS!";
         newUri = duplicateMatch( res.uri(), results );