jwilder · Mar 9, 2012
diff --git a/‎.gitignore
+3 b/‎.gitignore
+3
diff --git a/‎AUTHORS.md
+2 b/‎AUTHORS.md
+2
diff --git a/‎LICENSE
+20 b/‎LICENSE
+20
diff --git a/‎collection-stats.py
-99 b/‎collection-stats.py
-99
diff --git a/‎index-stats.py
-116 b/‎index-stats.py
-116
diff --git a/‎query/__init__.py ‎mongodbtools/__init__.py b/‎query/__init__.py ‎mongodbtools/__init__.py
diff --git a/‎mongodbtools/collection_stats.py
+103 b/‎mongodbtools/collection_stats.py
+103
diff --git a/‎mongodbtools/index_stats.py
+120 b/‎mongodbtools/index_stats.py
+120
diff --git a/‎mongodbtools/query/__init__.py b/‎mongodbtools/query/__init__.py
diff --git a/‎query/helpers.py ‎mongodbtools/query/helpers.py
+22-16 b/‎query/helpers.py ‎mongodbtools/query/helpers.py
+22-16
diff --git a/‎mongodbtools/query/parser.py
+146 b/‎mongodbtools/query/parser.py
+146
diff --git a/‎mongodbtools/redundant_indexes.py
+46 b/‎mongodbtools/redundant_indexes.py
+46
diff --git a/‎redundant-indexes.py
-39 b/‎redundant-indexes.py
-39
diff --git a/‎setup.py
+29 b/‎setup.py
+29
@@ -2,3 +2,6 @@
 *.pyc
 distribute-0.6.10.tar.gz
 virtualenv
+build
+dist
+mongodbtools.egg-info
@@ -0,0 +1,2 @@
+## Authors
+* Jason Wilder
@@ -0,0 +1,20 @@
+Copyright (c) 2012 Jason Wilder
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+
+"""
+This script prints some basic collection stats about the size of the
+collections and their indexes.
+"""
+
+from prettytable import PrettyTable
+import psutil
+from pymongo import Connection
+from pymongo import ReadPreference
+
+connection = Connection(read_preference=ReadPreference.SECONDARY)
+
+def compute_signature(index):
+    """Return a string identifying an index: its namespace followed by a
+    "<field>_<value>" fragment for every entry of its key spec."""
+    key_spec = index["key"]
+    fragments = ["%s_%s" % (field, key_spec[field]) for field in key_spec]
+    return index["ns"] + "".join(fragments)
+
+def get_collection_stats(database, collection):
+    """Announce the collection being inspected, then return the result of
+    the ``collstats`` command run for it on *database*."""
+    print "Checking DB: %s" % (collection.full_name,)
+    stats = database.command("collstats", collection.name)
+    return stats
+
+# From http://www.5dollarwhitebox.org/drupal/node/84
+def convert_bytes(bytes):
+    """Format a byte count as a short human-readable string.
+
+    The value is scaled by powers of 1024 and rendered with two decimals
+    and a one-letter suffix: ``T``, ``G``, ``M``, ``K`` or ``b``.
+
+    Negative values (for example a RAM headroom figure when the indexes
+    are larger than physical memory) are scaled by their magnitude and
+    keep their sign, instead of always being printed as raw bytes.
+
+    :param bytes: size in bytes; anything accepted by ``float()``.
+    :returns: the formatted size, e.g. ``'1.50G'`` or ``'-2.00K'``.
+    """
+    value = float(bytes)
+    magnitude = abs(value)
+    # Largest unit first so the first matching threshold wins.
+    for threshold, suffix in ((1099511627776, 'T'),
+                              (1073741824, 'G'),
+                              (1048576, 'M'),
+                              (1024, 'K')):
+        if magnitude >= threshold:
+            return '%.2f%s' % (value / threshold, suffix)
+    return '%.2fb' % value
+
+def main():
+    """Print a size report for every collection on the connected server.
+
+    Runs ``collstats`` (through ``get_collection_stats``) for each
+    collection of each database except ``local``, prints one table row per
+    collection, then the document/data/index totals and the RAM headroom
+    figures derived from ``psutil.phymem_usage()``.
+    """
+    summary_stats = {
+        "count" : 0,
+        "size" : 0,
+        "indexSize" : 0
+    }
+
+    all_db_stats = {}
+    for db in connection.database_names():
+        # FIXME: Add an option to include oplog stats.
+        if db == "local":
+            continue
+
+        database = connection[db]
+        all_db_stats[database.name] = []
+        for collection_name in database.collection_names():
+            stats = get_collection_stats(database, database[collection_name])
+            all_db_stats[database.name].append(stats)
+
+            summary_stats["count"] += stats["count"]
+            summary_stats["size"] += stats["size"]
+            summary_stats["indexSize"] += stats.get("totalIndexSize", 0)
+
+    x = PrettyTable(["Collection", "Count", "% Size", "DB Size", "Avg Obj Size", "Indexes", "Index Size"])
+    x.set_field_align("Collection", "l")
+    x.set_field_align("% Size", "r")
+    x.set_field_align("Count", "r")
+    x.set_field_align("DB Size", "r")
+    x.set_field_align("Avg Obj Size", "r")
+    x.set_field_align("Index Size", "r")
+    x.set_padding_width(1)
+
+    print
+
+    total_size = float(summary_stats["size"])
+    for db in all_db_stats:
+        for stat in all_db_stats[db]:
+            # With no data at all the total is 0; report 0% rather than
+            # dividing by zero.
+            if total_size:
+                percent = (stat["size"] / total_size) * 100
+            else:
+                percent = 0.0
+            x.add_row([stat["ns"], stat["count"], "%0.1f%%" % percent,
+                       convert_bytes(stat["size"]),
+                       convert_bytes(stat.get("avgObjSize", 0)),
+                       stat.get("nindexes", 0),
+                       convert_bytes(stat.get("totalIndexSize", 0))])
+
+    print
+    # NOTE(review): the "% Size" column holds formatted strings, so this
+    # ordering is presumably lexicographic rather than numeric -- confirm
+    # against the PrettyTable version in use.
+    x.printt(sortby="% Size")
+    print "Total Documents:", summary_stats["count"]
+    print "Total Data Size:", convert_bytes(summary_stats["size"])
+    print "Total Index Size:", convert_bytes(summary_stats["indexSize"])
+
+    # Take a single memory snapshot so every RAM figure below describes the
+    # same instant. Going by the labels printed here, [1] is the used amount
+    # and [3] the used percentage; [0] is presumably the total -- confirm
+    # against the psutil version in use.
+    mem = psutil.phymem_usage()
+    ram_headroom = mem[0] - summary_stats["indexSize"]
+    print "RAM Headroom:", convert_bytes(ram_headroom)
+    print "RAM Used: %s (%s%%)" % (convert_bytes(mem[1]), mem[3])
+    # 100.0 keeps this a float division even if the percentage is an int.
+    print "Available RAM Headroom:", convert_bytes((100.0 - mem[3]) / 100 * ram_headroom)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+
+"""
+This script prints an overview of every index with its size and its share
+of the total index size, followed by the five largest indexes.
+"""
+
+from prettytable import PrettyTable
+import psutil
+from pymongo import Connection
+from pymongo import ReadPreference
+
+connection = Connection(read_preference=ReadPreference.SECONDARY)
+
+def compute_signature(index):
+    """Return a string identifying an index: its namespace followed by a
+    "<field>_<value>" fragment for every entry of its key spec."""
+    key_spec = index["key"]
+    fragments = ["%s_%s" % (field, key_spec[field]) for field in key_spec]
+    return index["ns"] + "".join(fragments)
+
+def get_collection_stats(database, collection):
+    """Announce the collection being inspected, then return the result of
+    the ``collstats`` command run for it on *database*."""
+    print "Checking DB: %s" % (collection.full_name,)
+    stats = database.command("collstats", collection.name)
+    return stats
+
+# From http://www.5dollarwhitebox.org/drupal/node/84
+def convert_bytes(bytes):
+    """Format a byte count as a short human-readable string.
+
+    The value is scaled by powers of 1024 and rendered with two decimals
+    and a one-letter suffix: ``T``, ``G``, ``M``, ``K`` or ``b``.
+
+    Negative values (for example a RAM headroom figure when the indexes
+    are larger than physical memory) are scaled by their magnitude and
+    keep their sign, instead of always being printed as raw bytes.
+
+    :param bytes: size in bytes; anything accepted by ``float()``.
+    :returns: the formatted size, e.g. ``'1.50G'`` or ``'-2.00K'``.
+    """
+    value = float(bytes)
+    magnitude = abs(value)
+    # Largest unit first so the first matching threshold wins.
+    for threshold, suffix in ((1099511627776, 'T'),
+                              (1073741824, 'G'),
+                              (1048576, 'M'),
+                              (1024, 'K')):
+        if magnitude >= threshold:
+            return '%.2f%s' % (value / threshold, suffix)
+    return '%.2fb' % value
+
+def main():
+    """Print per-index size statistics for the connected server.
+
+    Runs ``collstats`` (through ``get_collection_stats``) for each
+    collection of each database except ``local``, then prints:
+
+    * an overview table with one row per index of every collection,
+    * the five largest indexes,
+    * document/data/index totals and RAM headroom figures derived from
+      ``psutil.phymem_usage()``.
+    """
+    summary_stats = {
+        "count" : 0,
+        "size" : 0,
+        "indexSize" : 0
+    }
+
+    all_db_stats = {}
+    for db in connection.database_names():
+        # FIXME: Add an option to include oplog stats.
+        if db == "local":
+            continue
+
+        database = connection[db]
+        all_db_stats[database.name] = []
+        for collection_name in database.collection_names():
+            stats = get_collection_stats(database, database[collection_name])
+            all_db_stats[database.name].append(stats)
+
+            summary_stats["count"] += stats["count"]
+            summary_stats["size"] += stats["size"]
+            summary_stats["indexSize"] += stats.get("totalIndexSize", 0)
+
+    x = PrettyTable(["Collection", "Index","% Size", "Index Size"])
+    x.set_field_align("Collection", "l")
+    x.set_field_align("Index", "l")
+    x.set_field_align("% Size", "r")
+    x.set_field_align("Index Size", "r")
+    x.set_padding_width(1)
+
+    print
+
+    total_index_size = float(summary_stats["indexSize"])
+    # (index_size, row) pairs. A list rather than a dict keyed by size, so
+    # two indexes of equal size do not overwrite one another.
+    index_rows = []
+    for db in all_db_stats:
+        # The index loop must sit inside the per-collection loop; otherwise
+        # only the last collection of each database is reported.
+        for stat in all_db_stats[db]:
+            index_sizes = stat.get("indexSizes", {})
+            for index in index_sizes:
+                index_size = index_sizes.get(index, 0)
+                # With no indexes at all the total is 0; report 0% rather
+                # than dividing by zero.
+                if total_index_size:
+                    percent = (index_size / total_index_size) * 100
+                else:
+                    percent = 0.0
+                row = [stat["ns"], index, "%0.1f%%" % percent,
+                       convert_bytes(index_size)]
+                index_rows.append((index_size, row))
+                x.add_row(row)
+
+    print "Index Overview"
+    x.printt(sortby="Collection")
+
+    print
+    print "Top 5 Largest Indexes"
+    x = PrettyTable(["Collection", "Index","% Size", "Index Size"])
+    x.set_field_align("Collection", "l")
+    x.set_field_align("Index", "l")
+    x.set_field_align("% Size", "r")
+    x.set_field_align("Index Size", "r")
+    x.set_padding_width(1)
+
+    # Sort on the numeric size only; the sort is stable, so ties keep the
+    # order in which the indexes were collected.
+    largest_first = sorted(index_rows, key=lambda pair: pair[0], reverse=True)
+    for size, row in largest_first[0:5]:
+        x.add_row(row)
+    x.printt()
+    print
+
+    print "Total Documents:", summary_stats["count"]
+    print "Total Data Size:", convert_bytes(summary_stats["size"])
+    print "Total Index Size:", convert_bytes(summary_stats["indexSize"])
+
+    # Take a single memory snapshot so both RAM figures describe the same
+    # instant. [3] is treated as the used percentage by the formula below;
+    # [0] is presumably the total -- confirm against the psutil version in
+    # use.
+    mem = psutil.phymem_usage()
+    ram_headroom = mem[0] - summary_stats["indexSize"]
+    print "RAM Headroom:", convert_bytes(ram_headroom)
+    # 100.0 keeps this a float division even if the percentage is an int.
+    print "Available RAM Headroom:", convert_bytes((100.0 - mem[3]) / 100 * ram_headroom)
+
+if __name__ == "__main__":
+    main()
@@ -25,6 +25,23 @@ def bson_iter(bson_file):
             raise InvalidBSON("bad eoo")
         yield bson._bson_to_dict(size_str + obj, dict, True)[0]
 
+def _deep_get(obj, field):
+    """Look up a possibly nested field such as ``a.b.c`` in a document.
+
+    A field without dots is read straight from *obj* with ``obj.get``.
+    For a dotted path each component is resolved against the value found
+    for the previous component (dict lookup for dicts, attribute lookup
+    for anything else), so paths of any depth descend correctly.
+
+    :param obj: the document; must provide ``get``.
+    :param field: field name, optionally dotted to reach embedded values.
+    :returns: the value found; ``None`` when the last component is
+        missing; ``False`` when any intermediate value is missing or
+        falsy.
+    """
+    parts = field.split(".")
+    if len(parts) == 1:
+        return obj.get(field)
+
+    current = obj.get(parts[0])
+    for part in parts[1:-1]:
+        if not current:
+            return False
+        # Descend from the value reached so far, not from the root document.
+        if isinstance(current, dict):
+            current = current.get(part)
+        else:
+            current = getattr(current, part, None)
+
+    if not current:
+        return False
+
+    if isinstance(current, dict):
+        return current.get(parts[-1])
+    # A missing attribute counts as "no value" instead of raising.
+    return getattr(current, parts[-1], None)
+
 def groupby(iterator, field):
     """
     Returns dictionary with the keys being the field to group by
@@ -34,8 +51,10 @@ def groupby(iterator, field):
     for example.
     """
     groups = {}
-    for k, g in itertools.groupby(iterator, lambda x: x.get(field)):
-        groups.setdefault(k, []).append(g)
+    for k, g in itertools.groupby(iterator, lambda x: _deep_get(x, field)):
+        items = groups.setdefault(k, [])
+        for item in g:
+            items.append(item)
     return groups
 
 def filter(iterator, field, value):
@@ -45,18 +64,5 @@ def filter(iterator, field, value):
     The field can be a nested field like a.b.c and it will descend into the
     embedded documents.
     """
-    def deep_get(obj, field, value):
-        parts = field.split(".")
-        if len(parts) == 1:
-            return obj.get(field) == value
-
-        last_value = {}
-        for part in parts[0:-1]:
-            last_value  = obj.get(part)
-
-        if not last_value:
-            return False
-
-        return last_value.get(parts[-1]) == value
 
-    return itertools.ifilter(lambda x: deep_get(x, field, value), iterator)
+    return itertools.ifilter(lambda x: _deep_get(x, field) == value, iterator)
@@ -0,0 +1,146 @@
+# simpleSQL.py
+#
+# simple demo of using the parsing library to do simple-minded SQL parsing
+# could be extended to include where clauses etc.
+#
+# Copyright (c) 2003, Paul McGuire
+#
+# Originally from http://pyparsing.wikispaces.com/file/view/simpleSQL.py
+
+from pyparsing import Literal, CaselessLiteral, Word, upcaseTokens, delimitedList, Optional, \
+    Combine, Group, alphas, nums, alphanums, ParseException, Forward, oneOf, quotedString, \
+    ZeroOrMore, restOfLine, Keyword
+
+def test( str ):
+    """Parse *str* with the simpleSQL grammar and print the resulting
+    tokens, columns, tables and where clause; on a parse failure print a
+    caret under the failing position followed by the error."""
+    print str,"->"
+    try:
+        tokens = simpleSQL.parseString( str )
+    except ParseException, err:
+        print " "*err.loc + "^\n" + err.msg
+        print err
+    else:
+        report = (
+            ("tokens = ", tokens),
+            ("tokens.columns =", tokens.columns),
+            ("tokens.tables =", tokens.tables),
+            ("tokens.where =", tokens.where),
+        )
+        for label, value in report:
+            print label, value
+    print
+
+
+# define SQL tokens
+# selectStmt is declared as a Forward here and given its definition with
+# "<<" further down, once all of its parts exist.
+selectStmt = Forward()
+# The SQL keywords are matched case-insensitively (caseless=True).
+selectToken = Keyword("select", caseless=True)
+fromToken   = Keyword("from", caseless=True)
+whereToken  = Keyword("where", caseless=True)
+
+# An identifier starts with a letter or "_"; the following characters may
+# also be digits, "$" or ".".
+ident          = Word( alphas+"_", alphanums + "_$." ).setName("identifier")
+# Column and table names are "."-separated identifiers combined into one
+# token; the *List variants group a delimited list of such names.
+columnName     = delimitedList( ident, ".", combine=True )
+columnNameList = Group( delimitedList( columnName ) )
+tableName      = delimitedList( ident, ".", combine=True )
+tableNameList  = Group( delimitedList( tableName ) )
+
+# whereExpression refers to itself (parenthesised sub-expressions and
+# and/or chains below), so it is declared first and defined later.
+whereExpression = Forward()
+and_ = Keyword("and", caseless=True)
+or_ = Keyword("or", caseless=True)
+in_ = Keyword("in", caseless=True)
+
+# Numeric literals: realNum contains a decimal point, intNum is digits only;
+# both accept an optional sign and an optional "E" exponent.
+E = CaselessLiteral("E")
+binop = oneOf("= != < > >= <= eq ne lt le gt ge", caseless=True)
+arithSign = Word("+-",exact=1)
+realNum = Combine( Optional(arithSign) + ( Word( nums ) + "." + Optional( Word(nums) )  |
+                                                         ( "." + Word(nums) ) ) +
+            Optional( E + Optional(arithSign) + Word(nums) ) )
+intNum = Combine( Optional(arithSign) + Word( nums ) +
+            Optional( E + Optional("+") + Word(nums) ) )
+
+# Right-hand side of a comparison. Alternatives are tried left to right, so
+# realNum is attempted before intNum.
+columnRval = realNum | intNum | quotedString | columnName # need to add support for alg expressions
+# A single condition: "column op value", "column in ( v1, v2, ... )", or a
+# parenthesised where-expression.
+whereCondition = Group(
+    ( columnName + binop + columnRval ) |
+    ( columnName + in_ + "(" + delimitedList( columnRval ) + ")" ) |
+    ( "(" + whereExpression + ")" )
+    )
+# One condition followed by any number of "and"/"or" + where-expression.
+whereExpression << whereCondition + ZeroOrMore( ( and_ | or_ ) + whereExpression )
+
+# define the grammar
+# Parsed pieces are exposed on the result under the names "columns",
+# "tables" and "where"; the where clause is optional with "" as its default.
+selectStmt      << ( selectToken +
+                   ( '*' | columnNameList ).setResultsName( "columns" ) +
+                   fromToken +
+                   tableNameList.setResultsName( "tables" ) +
+                   Optional( Group( whereToken + whereExpression ), "" ).setResultsName("where") )
+
+# Entry point of the grammar, used by test() via simpleSQL.parseString.
+simpleSQL = selectStmt
+
+# define Oracle comment format, and ignore them
+oracleSqlComment = "--" + restOfLine
+simpleSQL.ignore( oracleSqlComment )
+
+
+"""
+test( "SELECT * from XYZZY, ABC" )
+test( "select * from SYS.XYZZY" )
+test( "Select A from Sys.dual" )
+test( "Select A,B,C from Sys.dual" )
+test( "Select A, B, C from Sys.dual" )
+test( "Select A, B, C from Sys.dual, Table2   " )
+test( "Xelect A, B, C from Sys.dual" )
+test( "Select A, B, C frox Sys.dual" )
+test( "Select" )
+test( "Select &&& frox Sys.dual" )
+test( "Select A from Sys.dual where a in ('RED','GREEN','BLUE')" )
+test( "Select A from Sys.dual where a in ('RED','GREEN','BLUE') and b in (10,20,30)" )
+test( "Select A,b from table1,table2 where table1.id eq table2.id -- test out comparison operators" )
+test( "Select * from User, RemoteAccount where user._id = user.user_id)" )
+
+
+Test output:
+>pythonw -u simpleSQL.py
+SELECT * from XYZZY, ABC ->
+tokens =  ['select', '*', 'from', ['XYZZY', 'ABC']]
+tokens.columns = *
+tokens.tables = ['XYZZY', 'ABC']
+
+select * from SYS.XYZZY ->
+tokens =  ['select', '*', 'from', ['SYS.XYZZY']]
+tokens.columns = *
+tokens.tables = ['SYS.XYZZY']
+
+Select A from Sys.dual ->
+tokens =  ['select', ['A'], 'from', ['SYS.DUAL']]
+tokens.columns = ['A']
+tokens.tables = ['SYS.DUAL']
+
+Select A,B,C from Sys.dual ->
+tokens =  ['select', ['A', 'B', 'C'], 'from', ['SYS.DUAL']]
+tokens.columns = ['A', 'B', 'C']
+tokens.tables = ['SYS.DUAL']
+
+Select A, B, C from Sys.dual ->
+tokens =  ['select', ['A', 'B', 'C'], 'from', ['SYS.DUAL']]
+tokens.columns = ['A', 'B', 'C']
+tokens.tables = ['SYS.DUAL']
+
+Select A, B, C from Sys.dual, Table2    ->
+tokens =  ['select', ['A', 'B', 'C'], 'from', ['SYS.DUAL', 'TABLE2']]
+tokens.columns = ['A', 'B', 'C']
+tokens.tables = ['SYS.DUAL', 'TABLE2']
+
+Xelect A, B, C from Sys.dual ->
+^
+Expected 'select'
+Expected 'select' (0), (1,1)
+
+Select A, B, C frox Sys.dual ->
+               ^
+Expected 'from'
+Expected 'from' (15), (1,16)
+
+Select ->
+      ^
+Expected '*'
+Expected '*' (6), (1,7)
+
+Select &&& frox Sys.dual ->
+       ^
+Expected '*'
+Expected '*' (7), (1,8)
+
+>Exit code: 0
+"""
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+"""
+This is a simple script to print out potentially redundant indexes in a MongoDB instance.
+For example, if an index is defined on {field1:1,field2:1} and there is another index
+with just fields {field1:1}, the latter index is not needed since the first index already
+indexes the necessary fields.
+"""
+from pymongo import Connection
+
+def main():
+    """Report indexes that look redundant, for every database on the server.
+
+    An index is reported when its key spec is a leading prefix of another
+    index on the same namespace, e.g. {field1:1} next to
+    {field1:1, field2:1}. Signatures are structured tuples rather than
+    concatenated strings, so similarly spelled names (a field ``addr``
+    next to a field ``addr_1``, or namespaces that merely share a prefix)
+    can no longer produce false matches.
+    """
+    connection = Connection()
+
+    def normalize_direction(value):
+        # Directions may come back as 1.0 / -1.0; compare them as ints.
+        # Values that are not numbers (such as "2d") are kept as they are.
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            return value
+
+    def compute_signature(index):
+        # (namespace, ((field, direction), ...)) in key-spec order.
+        key_spec = index["key"]
+        fields = tuple((key, normalize_direction(key_spec[key]))
+                       for key in key_spec)
+        return (index["ns"], fields)
+
+    def is_prefix_of(signature, other_sig):
+        # True when both indexes share a namespace and the first one's
+        # fields are a strict leading prefix of the other's.
+        ns, fields = signature
+        other_ns, other_fields = other_sig
+        return (ns == other_ns
+                and len(fields) < len(other_fields)
+                and other_fields[:len(fields)] == fields)
+
+    def report_redundant_indexes(current_db):
+        print "Checking DB: %s" % current_db.name
+        indexes = current_db.system.indexes.find()
+        index_map = {}
+        for index in indexes:
+            signature = compute_signature(index)
+            index_map[signature] = index
+
+        for signature in index_map:
+            for other_sig in index_map:
+                if signature == other_sig:
+                    continue
+                if is_prefix_of(signature, other_sig):
+                    print "Index %s[%s] may be redundant with %s[%s]" % (
+                        index_map[signature]["ns"],
+                        index_map[signature]["name"],
+                        index_map[other_sig]["ns"],
+                        index_map[other_sig]["name"])
+
+    for db in connection.database_names():
+        report_redundant_indexes(connection[db])
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,29 @@
+# Packaging script for the mongodbtools distribution.
+from setuptools import setup, find_packages
+
+version='0.1'
+
+# Discover the packages to ship, leaving out ez_setup, examples and tests.
+packages = find_packages(exclude=['ez_setup', 'examples', 'tests'])
+# Echo the discovered packages whenever this script runs.
+print packages
+setup(
+    name='mongodbtools',
+    version=version,
+    description='Python tools for working with MongoDB',
+    author='Jason Wilder',
+    author_email='code@jasonwilder.com',
+    maintainer='Jason Wilder',
+    license='MIT',
+    url='http://github.com/jwilder/mongodb-tools',
+    packages=packages,
+    # One console command per tool, each bound to that module's main().
+    entry_points = """\
+    [console_scripts]
+    collection-stats=mongodbtools.collection_stats:main
+    index-stats=mongodbtools.index_stats:main
+    redundant-indexes=mongodbtools.redundant_indexes:main
+    """,
+    install_requires=[
+        'pymongo>=2.1',
+        'PrettyTable',
+        'psutil==0.3.0',
+        'mongoengine==0.5.0'
+    ],
+)