-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmspace.py
executable file
·1589 lines (1263 loc) · 56.3 KB
/
mspace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
# mspace.py - An implementation of metric space indexes for efficient
# similarity search
#
# Copyright (C) 2007,2008 Jochen Schulz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307,
# USA.
"""
Metric Spaces
==============
.. contents::
Introduction
-------------
This module provides efficient similarity search by implementing
different data structures for indexing metric spaces. You can use it to
index an arbitrary number of objects which you later want to search for
objects "similar" to a given other object.
For this module to be useful, you need a function with metric properties
(see below for a definition) which you can apply to the type of objects
you want to search for. The only pre-defined metric function in this
module is `levenshtein`, which is a common metric to calculate the
(dis)similarity of two strings.
Beware that the indexing phase might take a long time, depending on the
number of objects you want to index and the runtime of your metric
distance function. Indexing only pays off when you are planning to do a
lot of searches.
Definitions
............
A metric space is a pair ``(S, f)`` where ``S`` is an arbitrary
(possibly infinite) set of objects and ``f`` is a function which returns
a "distance" for any two objects ``x``, ``y`` in ``S``. This distance is
a number used to express the similarity between the two objects. The
larger the distance, the more they differ. A distance of zero means they
are considered equal.
The function ``f`` has to be a "metric", i.e. the following conditions
have to be met for all ``x``, ``y``, ``z`` in ``S``:
- ``f(x, y) >= 0``
(positivity)
- ``f(x, y) == f(y, x)``
(symmetry)
- ``f(x, z) <= f(x, y) + f(y, z)``
(triangle inequality)
This definition adheres to most people's intuitive understanding of the
distance between two points in a Euclidian space (of arbitrary
dimensionality). [#euclid]_ Imagine you have three points in a
2-dimensional space like this::
A-------B
\ /
\ /
\ /
C
It is evident that the distance between two of these points is
necessarily larger than (or equal to) zero, and that the direction you
take for measuring (from ``A`` to ``B`` or vice versa) has no influence
on the distance. The third condition, the triangle inequality is easy to
grasp, too: if you know the distances between ``A`` and ``B`` and
between ``B`` and ``C``, you can be sure that their sum is equal to or
larger than the distance between ``A`` and ``C``. Voila, there's your
metric space.
In many cases, metric distance functions take a long time to compute.
Because of this, data structures used to index metric spaces are
designed to minimize the number of distance computations necessary for
searching the whole space by using the properties of its metric function
(most notably the triangle inequality) to discard as many objects as
possible after every computation.
.. [#euclid] In fact, Euclidian spaces are one type of metric space.
Use cases & caveats
....................
Metric distance functions can be defined for objects other than points
in Euclidian space, too. One very well known distance function is the
Levenshtein distance, which is applicable to arbitrary strings. Since it
is quite popular and indexing strings is one of the major use cases for
metric spaces, an implementation of this function is included in this
module. Other metric distance functions for strings are the
Damerau-Levenshtein variant and Hamming distance. These can be used for
a variety of tasks, such as spell-checking, record linkage (or
"deduplication"), error detection etc.
Please note that indexing a large number of objects usually takes a
considerable amount of time *and* memory. Indexing only pays off if you
are going to do *a lot* of searches in the index. If you just want to
search a set of objects for all objects similar to one or a few other
objects, you are better off comparing them sequentially by hand with
your distance function.
Installation
.............
This file is a stand-alone module for Python 3.9 and later which you
only have to save somewhere in your ``PYTHONPATH``. [#pythonpath]_ No
installation is necessary. The only tight dependencies are the modules
``sys``, ``random`` and ``weakref`` which are most probably already
included in your Python installation.
.. [#pythonpath] Run ``python -c "import sys; print(sys.path)"`` to learn
where Python currently looks for modules.
Basic usage
------------
Index creation
...............
Say you have a dictionary file containing one or more words per line.
You can make an index of all the words in it by doing:
.. python::
>>> import mspace
>>> mydict = open('dictionary.txt')
>>> words = mspace.tokenizer(mydict)
>>> index = mspace.VPTree(words, mspace.levenshtein)
You can delay the time-consuming construction phase by creating the
index without any parameters and explicitly calling ``construct`` at a
later time:
.. python::
>>> index = mspace.VPTree()
>>> index.construct(words, mspace.levenshtein)
Please note that the index' content is dismissed on every call to
``construct``. The index is built from scratch with only the objects and
distance function passed as parameters.
Performing range-searches
..........................
After you have built the index (and got yourself some coffee if the
index is large) you may search it for all objects having distance to a
given other object between arbitrary bounds:
.. python::
>>> index.range_search('WOOD', min_dist=0, max_dist=1)
['GOOD', 'WOOD', 'WOOL', 'MOOD']
In this case, you received four results: the object you searched for
(apparently it has been in your dictionary, too) and two other objects
"GOOD" and "WOOL", both of which have the requested maximum distance of
one to your query object. Result lists are unordered and do not contain
information about their contents' distance to the query object. If you
need this, you have to calculate it by hand.
Both ``min_dist`` and ``max_dist`` are optional parameters defaulting to
zero. Thus, if you leave them out, a search for objects which are
exactly equal (as defined by your distance function) to the query object
is performed. For historical reasons, and since range-searching with a
minimum distance of zero and a maximum greater than zero is quite
common, there is a shortcut to search with ``min_dist=0``:
.. python::
>>> index.search('WOOD', 1)
['GOOD', 'WOOD', 'WOOL', 'MOOD']
Performing nearest-neighbour-searches
......................................
The second type of search you can perform is "k-nearest-neighbour"
search. It returns a given number of objects from the index that are
closest to the query object. Search results are guaranteed to never
contain an object with a distance to the query object that is larger
than that of any other object in the tree.
Result lists are composed of ``(object, distance)`` tuples, sorted
ascendingly by the distance. If you don't specify a maximum number of
matches to return, it defaults to one:
.. python::
>>> index.nn_search('WOOD')
[('WOOD', 0)]
>>> index.nn_search('WOOD', 5)
[('WOOD', 0), ('WOOL', 1), ('GOOD', 1), ('MOOD', 1), ('FOOT', 2)]
Note that the index may contain other objects with a distance of two to
the query object ``'WOOD'`` (e.g. ``'FOOL'``). They have just been cut
off because a maximum of five objects has been requested. You have no
influence on the choice of representatives that is made.
Note that you must not expect this method to always return a list of
length ``num`` because you might ask for more objects than are currently
in the index.
Advanced usage
---------------
Indexing complex objects
.........................
If you have "complex" objects which you want to index by only one
specific attribute, you can write a simple wrapper around `levenshtein`
(or some other function applicable to the attribute) which extracts this
attribute from your objects and returns the distance between their
attributes' value. This way you can search for and retrieve complex
objects from the index while comparing only a single attribute
.. python::
>>> import mspace
>>>
>>> class Record(object):
... def __init__(self, firstname, surname):
... self._firstname, self._surname = firstname, surname
...
>>> def firstnameLev(r1, r2):
... return mspace.levenshtein(r1._firstname, r2._firstname)
...
>>> index = mspace.VPTree(listOfRecords, firstnameLev)
>>> rec = Record("Paul", "Brady")
>>> index.search(rec, 2)
[<Record: 'Paula Bean'>, <Record: 'Raoul Perky'>, <Record: 'Paul Simon'>]
Of course you can also use more advanced techniques like writing a
function factory which returns a function that extracts arbitrary
attributes from your objects and applies a metric function to it:
.. python::
>>> def metric_using_attr(metric, attr):
... def f(one, other, attr=attr):
... attr1, attr2 = getattr(one, attr), getattr(other, attr)
... return metric(attr1, attr2)
... return f
...
>>> metric = metric_using_attr(mspace.levenshtein, "_firstname")
>>> metric( Record("Paul", "Brady"), Record("Paul", "Simon") )
0
(Note that the inner function ``f`` in this example deviates from the
standard protocol by accepting an optional third parameter. This is
done here only to pull the name ``attr`` into the inner function's
namespace to save some time when looking up its associated value. No
index structure in this module will ever call its distance function with
more than two parameters anyway, so that whatever you passed as ``attr``
to ``metric_using_attr`` when creating your function will be used when
this function gets called by an index. Got it?)
Reverse indexes
................
Of course you can use a similar technique to avoid full indexes and
instead create an index which only contains references to your data (in
a database, text file, somewhere on the net etc.). Your distance
function would then use the supplied values to fetch the objects to
compare from your data source. But, I cannot stress this enough, your
distance function's performance is absolutely crucial to efficient
indexing and searching. Therefore you should make sure that your way of
accessing the data is really fast.
Choice of data structure
-------------------------
This module currently gives you two data structures to choose from.
While the underlying algorithms are different, their search results
(given the same set of objects to index, distance function and maximum
distance) are exactly the same. If you find a scenario where this is not
the case, please let me know because this would be a serious bug.
Capabilities
.............
The algorithms implemented in this module have different capabilities
which are displayed in the table below:
+--------------------+-------+-------+
| |VPTree |BKTree |
+====================+=======+=======+
|non-discrete metric | X | |
+--------------------+-------+-------+
|insertion | | X |
+--------------------+-------+-------+
|deletion | | |
+--------------------+-------+-------+
The first row is about the value returned by the distance function the
index uses. BKTrees are not prepared to handle non-discrete distances
(for example floats) and will most probably perform really bad when they
occur.
The other two rows describe what you can do with the indexes *after*
they have been constructed. Please note that neither of them may shrink
over time. Only BKTrees are able to handle insertion efficiently.
Performance
............
[In short: you are probably better off using BKTrees unless you need a
non-discrete metric. When in doubt, test both options.]
Obviously, the whole point of this module is to speed up the process of
finding objects similar to one another. And, as you can imagine, the
different algorithms perform differently when exposed to a specific
problem.
Here's an example: I took the file ``american-english-large`` from the
Debian package ``wamerican-large`` and did some time measurements of
indexing and searching a random subset of it. The machine I used is an
1.1GHz Pentium3 running Python 2.4 from Debian stable **with Psyco
enabled**. Please note that the following numbers have been determined
completely unscientifical. I only show them to give you an idea of what
to expect. For serious benchmarking, I should have used the ``timeit``
module. Nevertheless, this is it:
+------+------------------------+------------------------+
| |BKTree |VPTree |
+======+======+=========+=======+=======+========+=======+
|size |time |per node |height |time |per node|height |
+------+------+---------+-------+-------+--------+-------+
| 5000 | 2.92|0.000584 |11 | 7.40|0.001480| 21|
+------+------+---------+-------+-------+--------+-------+
|10000 | 6.32|0.000632 |14 | 16.02|0.001602| 22|
+------+------+---------+-------+-------+--------+-------+
|15000 | 9.95|0.000663 |14 | 28.35|0.001890| 24|
+------+------+---------+-------+-------+--------+-------+
|20000 | 13.70|0.000685 |14 | 41.40|0.002070| 24|
+------+------+---------+-------+-------+--------+-------+
|25000 | 17.46|0.000699 |15 | 50.63|0.002025| 25|
+------+------+---------+-------+-------+--------+-------+
|30000 | 21.81|0.000727 |15 | 55.47|0.001849| 25|
+------+------+---------+-------+-------+--------+-------+
|35000 | 25.77|0.000736 |16 | 64.43|0.001841| 26|
+------+------+---------+-------+-------+--------+-------+
|40000 | 29.40|0.000735 |16 | 75.45|0.001886| 26|
+------+------+---------+-------+-------+--------+-------+
|45000 | 41.28|0.000917 |16 | 96.36|0.002141| 26|
+------+------+---------+-------+-------+--------+-------+
|50000 | 37.62|0.000752 |16 | 95.31|0.001906| 28|
+------+------+---------+-------+-------+--------+-------+
This table shows construction times (total and per node in seconds) for
data sets of a given size. Additionally, you can see the height
[#height]_ of the trees. Apparently, BKTrees can be constructed a lot
faster than VPTrees. Both of them need an increasing time per node with
growing data sets. This is expected since construction complexity is
``O(n log(n))`` in both cases.
+---------------+-----------------+-----------------+
| |BK, size: 50,000 |VP, size: 50,000 |
+------+--------+-------+---------+-------+---------+
|k |results | time | # dist | time | # dist |
+------+--------+-------+---------+-------+---------+
|0 | 0.89 |0.0008 | 8.14 | 0.0019| 17.28|
+------+--------+-------+---------+-------+---------+
|1 | 4.07 |0.1670 | 1583.77 | 0.2801| 1933.64|
+------+--------+-------+---------+-------+---------+
|2 | 38.10 |1.1687 |10353.31 | 1.4413| 13845.67|
+------+--------+-------+---------+-------+---------+
|3 | 304.57 |2.5614 |22202.28 | 3.0497| 27514.51|
+------+--------+-------+---------+-------+---------+
|4 |1584.86 |3.8727 |32376.54 | 4.1518| 36877.62|
+------+--------+-------+---------+-------+---------+
|5 |5317.03 |4.4182 |39616.04 | 4.9935| 42720.38|
+------+--------+-------+---------+-------+---------+
This table shows the runtime of searches done in the trees (in seconds),
the number of distance calculations done and the number of results for
growing error tolerance. All values are given as average over 100 random
searches (from the same file, but not necessarily from the set of
indexed objects). As you can see, search runtime (tightly connected to
the number of distance calculations) literally explodes for larger
values of ``k``. This growth only fades when an increased part of the
complete search space is examined (the number of distance calculations
equals the number of nodes compared with the query object).
As you can see, too, long search runtimes for large values of ``k``
don't actually hurt usability very much since you only get a usable
number of results for small ``k`` anyway. This is of course due to the
nature of the data set and the distance function used in this example.
Your application may vary greatly.
You also have to keep in mind that the dictionary I used contains almost
no duplicates. If you used the words of a real document as your data
set, your tree would have significantly less nodes than the number of
words in your document since you typically repeat a lot of words very
often. A quick test with my diploma thesis revealed only 4400 distinct
words in a document with 14,000 words (including LaTeX commands). This
makes searching much faster because equal objects (as defined by the
distance function) are only evaluated once.
.. [#height] If you look closely, you can see that the VPTree's height
doesn't grow continuously when the data set grows. The reason for
that phenomenon is that this implementation does not try very hard to
pick a good vantage point which is the key factor to get
well-balanced trees. So, in some cases, a vantage point may be chosen
that may result in trees which are higher than strictly necessary.
Optimization potential (or: Why the f*ck is this so slow!?)
------------------------------------------------------------
Ok, you have tried to index your small dictionary file and start to
wonder why this easy task takes several minutes. Here are a couple of
(possible) reasons:
- Indexing takes ``O(n log(n))`` for all data structures currently
implemented. So yes, doubling the number of indexed objects will
necessarily more than double your runtime for indexing, sorry.
- Python is slow. I have written an implementation very similar to this
one in Java which is significantly faster (by a factor of about 15 to
25, even when using Psyco_!). But the java implementation eats more
memory and unfortunately I cannot release it under a free license.
- Recursion is slow in Python. Recursion is the most natural way to
create and search trees and this code uses it a lot. Most of the
recursion in this module is "tail recursion", but the Python
interpreter is not able to optimize it into loops.
[Note: as of revision 60, searching in both trees and inserting into
BKTrees has been rewritten using loops instead of recursion.
Performance gain is quite small, though.]
- The distance distribution in your metric space is too dense. This
leads to the data structures being unable to discard large parts of
the indexed space while searching. In pathological cases, you may end
up with data structures resembling linked lists.
- Your distance function is non-discrete, i.e. it returns floats. How
well this is supported depends on the index structure in use.
- Your metric function is very expensive. Remember that this function
has to be called *very often* during indexing. You may use this
module's attribute ``dist_ctr`` to get an idea how often this is done.
It is incremented by one on every call to your distance function and
is otherwise unused.
- You are searching for non-similar objects. If you, for example, have
an index of strings with an average length of six characters and you
are continuously searching for strings with a maximum distance of
three or more, do not expect the search to be significantly faster
than linear testing of the complete search space. It may even be
slower.
Possible solutions include:
- Rewrite in C. Since I cannot be bothered to learn C, someone else
would have to do this.
- Use Pyrex or Psyco_. Quick tests with Psyco suggest that indexing
becomes about 3-4 times faster. This is as easy as doing an ``import
psyco; psyco.full()`` in your program. Read Psyco's manual for tuning
options.
[Note: as of about revision 46, on module loading an attempt is made
to import and enable Psyco for levenshtein and the tree classes. If it
doesn't succeed, you'll get a performance warning on ``stderr`` but
everything will continue to work flawlessly.]
- Pickle trees for static data sets. Pickling and unpickling indexes
using Python's standard ``pickle`` module is quite fast. But beware
that a pickled index takes a lot more space than your plain data.
- Rewrite tree construction and searching using loops. I am not sure
whether this will be faster, but it will probably take less memory
than the current approach and fix the problem with Python's recursion
limit. I fear that code readability will suffer, though. The current
implementations are quite close to the original descriptions of the
algorithms.
[Note: partially done, see above.]
- Optimize the distance function in use. Since I do not know your
distance function, I cannot help you with this.
As for `levenshtein`: the algorithm implemented in this module is the
classical dynamic programming variant with ``O(|n|*|m|)`` time
complexity (and ``O(min(|n|, |m|))`` for space). Other algorithms for
Levenshtein try not to compute the whole matrix but only the values
around the diagonal of the matrix that are strictly necessary. By
doing this, the average ``O(|n|*|m|)`` of the classical dynamic
programming algorithm could be improved to something like
``O(k*|n|)``. While it isn't obvious how much wall clock time this
approach saves in reality, it may be in fact a whole lot faster
because ``k`` is usually only a fraction of the length of the input
strings.
And if you take a close look at the algorithms, you may realize that
searching may be sped up by using an algorithm that doesn't actually
compute the distance between two strings, but one that only answers
whether two strings have a distance less than or equal to a specified
maximum distance. This can be done in ``O(k^2)`` and should boost
search performance quite noticeably.
.. _Citeseer: http://citeseer.ist.psu.edu/navarro99guided.html
If you are interested in any of these optimizations, I suggest reading
Gonzalo Navarro's excellent survey "Guided Tour To Approximate String
Matching" which you can get from Citeseer_.
Currently, this module always uses the same function for searching and
indexing. If you want to test your own implementations, you have to
replace your index object's ``_func`` attribute after indexing and
before searching.
Contact
--------
For comments, patches, flames and hugs send mail to:
jrschulz@well-adjusted.de.
"""
__docformat__ = 'restructuredtext en'
__author__ = '$Author: jrschulz $'
__version__ = '$Revision: 117 $'
__date__ = '$Date: 2008-07-23 23:01:46 +0200 (Wed, 23 Jul 2008) $'
__license__ = "GPL"
import sys, random, weakref
class MetricTree(object):
"""
Base class for metric space indexes which have tree-like properties.
It's implementation is not complete enough to make it actually
usable, but it should make it quite easy to implement other indexing
and searching strategies. In an ideal case, all you have to do is to
derive from this class, implement the methods `construct`,
`_get_child_candidates` and optionally `insert` and make `children`
an attribute or property returning all child nodes of the current
node. If you need an ``__init__`` method, you should call
``super(YourClass, self).__init__(objects, func, parent)`` in its
end as well.
Instances of this class (and its subclasses, of course) are supposed
to be recursive data structures. This means that they may be treated
like the root of a tree whether they have a parent or not. This
means every node has:
- a list of `_values` with every element of that list having a
distance of zero to all other elements in the list. Because of
triangle inequality, you only need to check this condition against
one element of this list.
- a `_parent` (`None` for root nodes)
- a list of `children` (empty list for leaf nodes)
- a `_height` (one for trees consisting of only one node)
- a number of nodes `_num_nodes` (the total number of nodes in this
subtree)
- a `_size` (the total number of objects stored in this subtree)
Note that in many (most? some?) cases, implementors don't have to
fiddle with these values directly. Instead, there are some helper
methods like `_incr_node_counter` and `_incr_size` that may be
called at appropriate times. If in doubt, just implement what you
strictly need first and then take a look whether node, size and
height counting works as expected (and whether you care).
As a bonus, this class implements some of python's magic methods so
that you can iterate over its instances' content and use the ``in``
operator to test for the existence of an object using the distance
function.
"""
_parent = None
"""
The parent of this node. ``None`` for root nodes.
"""
_values = list()
"""
A list of values in this subtree. All objects in this list are
considered equal by the distance function in use.
"""
_size = 0
"""
The number of objects in this subtree.
"""
_func = 0
"""
The distance function used for indexing and searching. It has to
accept any two objects from the list of objects you are indexing as
arguments and return a non-negative integer (or long).
"""
_height = 0
"""
"""
_num_nodes = 0
"""
"""
def __init__(self, objects=None, func=None, parent=None):
"""
Create a new metric tree. If ``objects`` and ``func`` are given,
the given objects will be indexed immediately using the distance
function which makes it possible to immediately start so search
for other objects in it. Otherwise, you have to call `construct`
later in order to make use of this metric tree.
"""
self._values = list()
self._size = 0
self._height = 1
self._func = func
self.parent = parent
self._num_nodes = 0
self._incr_node_counter()
self._calculate_height()
if objects and func:
self.construct(objects, func)
def range_search(self, obj, min_dist=0, max_dist=0):
"""
Return a list of all objects in this subtree whose distance to
`obj` is at least `min_dist` and at most `max_dist`. `min_dist`
and `max_dist` have to satisfy the condition ``0 <= min_dist <=
max_dist``.
If this metric tree is empty, an empty list is returned,
regardless of whether this tree has been previously constructed
or not.
If calling the distance function fails for any reason,
`UnindexableObjectError` will be raised.
"""
assert( 0 <= min_dist <= max_dist )
if not self: return list()
result, candidates = list(), [self]
while candidates:
node = candidates.pop()
distance = node._get_dist(obj)
if min_dist <= distance <= max_dist:
result.extend(node._values)
candidates.extend( node._get_child_candidates(
distance, min_dist, max_dist))
return result
def search(self, obj, max_dist):
"""
Equivalent to range_search(obj, min_dist=0, max_dist).
"""
return self.range_search(obj, max_dist=max_dist)
def nn_search(self, obj, num=1):
    """
    Perform a k-nearest-neighbour search and return a sorted list of
    at most ``num`` objects together with their distance to the
    query object (``(object, distance)`` tuples). Sorting is done by
    the distance (ascending). `num` has to be an integer larger than
    zero.

    For obvious reasons, this method cannot return more objects than
    are currently present in the tree: if ``num > len(self)``, only
    ``len(self)`` pairs are returned. An empty tree yields an empty
    list, regardless of whether the tree has been previously
    constructed or not.

    If several objects in this tree share the same maximum distance
    to `obj`, no guarantee is given about which of these objects
    will be returned and which are left out.

    If calling the distance function fails for any reason,
    `UnindexableObjectError` will be raised.
    """
    if num < 1:
        return list()
    if not self:
        return list()
    neighbours = list()  # list of (value, distance) tuples
    candidates = [self]
    while candidates:
        cand = candidates.pop()
        distance = cand._get_dist(obj)
        # cand qualifies when it is closer than at least one of the
        # neighbours found so far
        candidate_is_neighbour = any(distance < n[1] for n in neighbours)
        if candidate_is_neighbour or len(neighbours) < num:
            neighbours.extend([(v, distance) for v in cand._values])
            # sort by distance; key= replaces the former cmp-based
            # sort, which no longer exists in Python 3
            neighbours.sort(key=lambda pair: pair[1])
            if len(neighbours) > num:
                neighbours = neighbours[:num]
            elif len(neighbours) == num:
                max_dist = neighbours[-1][1]
                if max_dist == 0:
                    # exact matches only -- nothing can be closer
                    break
                # BUG FIX: was `node._get_child_candidates(...)`;
                # `node` is unbound in this scope, the popped node
                # is `cand`.
                candidates.extend(
                    cand._get_child_candidates(distance, 0, max_dist))
            else:
                # fewer than `num` neighbours so far: descend into
                # all children (same fix: `node` -> `cand`)
                candidates.extend(cand.children)
    return neighbours
def _get_child_candidates(self, distance, min_dist, max_dist):
    """
    Return a sequence of those child nodes that may still contain
    objects whose distance to a certain query object lies between
    ``min_dist`` and ``max_dist`` (both inclusive). The query object
    itself is *not* passed in; instead, ``distance`` is its
    previously calculated distance to this node.

    Concrete subclasses must override this method.
    """
    raise NotImplementedError()
def construct(self, objects, func):
    """
    (Re)build the index over the given ``objects`` with the distance
    function ``func``, discarding any previous contents. ``objects``
    may be a sequence or any iterable; ``func`` must be applicable
    to every object it contains.

    If calling the distance function fails for any reason,
    `UnindexableObjectError` will be raised.

    Concrete subclasses must override this method.
    """
    raise NotImplementedError()
def insert(self, obj):
    """
    Insert a single object into the metric tree and return ``self``,
    i.e. the tree itself.

    Not every subclass of `MetricTree` can support this efficiently;
    such implementations keep this default, which raises
    `NotImplementedError`.

    If calling the distance function fails for any reason,
    `UnindexableObjectError` will be raised.
    """
    raise NotImplementedError()
def is_root(self):
    """
    Return whether this node is the root of a tree, i.e. whether it
    has no parent.
    """
    has_parent = self.parent is not None
    return not has_parent
def is_leaf(self):
    """
    Return whether this node is a leaf, i.e. whether it has no
    children.
    """
    if self.children:
        return False
    return True
def __children(self):
    """
    A sequence of this node's children; leaf nodes report an empty
    sequence. How many children a node may have is
    implementation-dependent.

    Concrete subclasses must override the getter.
    """
    raise NotImplementedError()
children = property(__children)
def __num_nodes(self):
    """
    The number of nodes in this tree.

    This can differ from the number of stored objects: objects the
    distance function considers equal (i.e. ``self._func(p, q)``
    returns ``0``) share one node, and an empty tree still consists
    of a single root node holding no values.
    """
    return self._num_nodes
num_nodes = property(__num_nodes)
def __height(self):
    """
    The height of this tree: ``0`` for an empty tree, ``>= 1`` as
    soon as one or more objects are stored.
    """
    return self._height
height = property(__height)
def __parent(self):
    """
    The parent of this node, or `None` when this node is the root
    of a tree.
    """
    return self._parent
def __set_parent(self, parent):
    """
    Set the parent of this node.

    The reference is stored as a weak proxy so that parent and
    child do not form a reference cycle which the garbage collector
    cannot dissolve by itself.
    """
    if parent is None:
        self._parent = None
        return
    # When the parent node disappears, the proxy's callback fires
    # and this node recognizes it is the new root.
    def _parent_gone(proxy):
        self.__set_parent(None)
    self._parent = weakref.proxy(parent, _parent_gone)
parent = property(__parent, __set_parent)
def _incr_size(self, incr=1):
    """
    Add ``incr`` to the size counter of this node and of every
    ancestor up to the root.

    Should be called whenever an object is inserted into the tree.
    """
    def _bump(node, incr=incr):
        node._size += incr
    self._apply_upwards(_bump)
def _calculate_height(self, recursive=True):
    """
    Reset this node's height to one and, when `recursive` is true,
    propagate the updated height up through all ancestors.
    """
    self._height = 1
    if not recursive:
        return
    depth = 1
    ancestor = self.parent
    while ancestor is not None:
        depth += 1
        # only ever grow an ancestor's height, never shrink it
        if ancestor._height < depth:
            ancestor._height = depth
        ancestor = ancestor.parent
def _incr_node_counter(self, incr=1):
    """
    Add ``incr`` to the node counter of this node and of every
    ancestor up to the root.

    Should be called whenever a new child of this node is created.
    """
    def _bump(node, incr=incr):
        node._num_nodes += incr
    self._apply_upwards(_bump)
def _apply_upwards(self, func, **args):
    """
    Apply ``func`` to this node and then to each of its ancestors in
    turn, up to and including the root. ``func`` must accept a node
    as its first parameter and may accept arbitrary keyword
    parameters as well.
    """
    current = self
    while True:
        func(current, **args)
        if current.is_root():
            break
        current = current.parent
def _get_dist(self, obj):
    """
    Return the distance between ``obj`` and one of this node's
    values, computed with this node's distance function.

    Raises `UnindexableObjectError` when distance computation fails.
    """
    global dist_ctr
    try:
        result = self._func(self._values[0], obj)
        dist_ctr += 1  # bookkeeping: count distance evaluations
    except IndexError as e:
        # node holds no values -- report and re-raise unchanged
        sys.stderr.write("Node is empty, cannot calculate distance!\n")
        raise e
    except Exception as e:
        raise UnindexableObjectError(e, "Cannot calculate distance"
            + " between objects %s and %s using %s" \
            % (self._values[0], obj, self._func))
    return result
def __iter__(self):
    """
    A generator that yields all objects in this node and its
    children by doing recursive pre-order traversal. Implementors
    might choose another traversal method that better suits their
    data structure.

    Objects come back in no specific order; however, as long as the
    tree's contents do not change and the `children` implementation
    reports children in a stable order, repeated iterations yield
    the objects in the same order. If `children` is not implemented
    at all, `NotImplementedError` will be raised.
    """
    for value in self._values:
        yield value
    for subtree in self.children:
        for value in subtree:
            yield value
itervalues = __iter__
def iternodes(self):
"""
A generator that yields all nodes in this subtree by doing
recursive pre-order traversal. Implementors might choose to use
another traversal method which better suits their data
structure.
Note that objects are returned unordered.
This implementation will always return nodes in the same order