
Commit 97bc8ab
Author: Guido Serra aka Zeph
Commit message: publishing on github
1 parent 9ea1376 · commit 97bc8ab

21 files changed: +2031 -0 lines changed

.gitignore

+4
@@ -1 +1,5 @@
 .idea
+*/*.pyc
+MANIFEST
+dist
+build

LICENSE.txt

+674
Large diffs are not rendered by default.

MANIFEST.in

+4
@@ -0,0 +1,4 @@
include LICENSE.txt
include README.md
recursive-include sqlhbase *.py
recursive-exclude sqlhbase *.pyc

README.md

+104
@@ -0,0 +1,104 @@
========
SqlHBase
========

SqlHBase is an HBase ingestion tool for MySQL-generated dumps.

The aim of this tool is to provide a 1:1 mapping of a MySQL table
into an HBase table, mapped on Hive (the schema is handled too).

Running it requires a working HBase with Thrift enabled,
and a Hive instance with the metastore properly configured and
Thrift enabled as well. If you need I/O performance, I recommend
looking into Pig or Jython, or directly at a native MapReduce job.

Sqoop was discarded as an option, as it doesn't cope with dump files
and it does not compute the difference between dumps before ingestion.

SqlHBase uses a 2-level ingestion process, described below.

"INSERT INTO `table_name` VALUE (), ()" statements are hashed
21+
and stored (dropping anything at the left side of the first open
22+
round bracket) as a single row into a staging table on HBase (the
23+
md5 hash of the row is the row_key on HBase).
24+
When multiple dumps of the same table/database are inserted, this
25+
prevents (or at least reduce) the duplication of data on HBase side.
26+
27+
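A minimal sketch of the hashing step just described, using only the standard
library; the helper name is an illustration, not the one used by the scripts:

    # Sketch only: derive the staging row key from an extended INSERT statement
    # by dropping everything left of the first open bracket and hashing the rest.
    import hashlib

    def staging_row_key(insert_stmt):
        payload = insert_stmt[insert_stmt.index("("):]   # just the tuples
        return hashlib.md5(payload.encode("utf-8")).hexdigest()

    key = staging_row_key('INSERT INTO `users` VALUES (1,"a"),(2,"b");')
    print(key)  # this hash becomes the row key in the staging table
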
MySQL by default chunks rows as tuples, up to 16MB, in a single
INSERT statement. Given that, we basically have a list of tuples:

[(1, "c1", "c2", "c3"), (2, "c1", "c2", "c3"), ... ]

An initial attempt at parsing/splitting such a string with a regexp
failed, of course, since a column value could contain ANYTHING,
even round brackets and quotes. This kind of language is not
recognizable by a finite state automaton, so something else had to
be implemented, for example to keep track of the nested brackets.
A PDA (pushdown automaton) would have helped, but... as you can
see above, the syntax is exactly that of a list of tuples
in Python... an eval() is all we needed in such a case
(and it is also, I guess, optimized at the C level by the interpreter).

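The eval() trick, as a sketch; the helper name and the NULL-to-None mapping are
assumptions for illustration, and this of course only makes sense on dump files
you trust:

    # Sketch only: the VALUES payload of an extended INSERT happens to be valid
    # Python literal syntax, so eval() turns it into a list of tuples.
    def parse_insert(insert_stmt):
        payload = insert_stmt[insert_stmt.index("("):].rstrip(";\n")
        # bare NULL tokens are not Python; mapping them to None is an assumption
        return eval("[" + payload + "]", {"__builtins__": {}, "NULL": None})

    rows = parse_insert("INSERT INTO `t` VALUES (1,'c1',NULL),(2,'c2',3.5);")
    print(rows)  # [(1, 'c1', None), (2, 'c2', 3.5)]
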
Keep in mind that the IDs of the rows are integers,
while HBase wants a string... plus, we need to do some zero padding,
because HBase sorts its keys lexicographically.

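For example (the key width of 12 digits is an arbitrary choice here, not
something mandated by the tool):

    # Zero-pad integer row IDs so lexicographic order matches numeric order.
    def padded_key(row_id, width=12):
        return str(row_id).zfill(width)

    assert padded_key(7) < padded_key(42) < padded_key(1000)
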
There are tons of forum threads about how bad it is to use a
monotonically increasing key on HBase, but... this is what we needed.

[...]

A 2-level Ingestion Process
===========================

A staging level -> (bin/sqlhbase-mysqlimport)
---------------------------------------------
No interpretation of the content of the MySQL dump file happens here,
apart from splitting schema data from raw data (the INSERTs).
Two tables are created: "namespace"_creates and "namespace"_values.
The first contains an entry/row for each dump file ingested, having as
its row key the timestamp of the day found at the bottom of the dump
file (or one provided on the command line, in case that information is
missing). Such a row contains the list of hashes for each table (see
below), a CREATE statement for each table and for each view, plus some
statistics: the time spent parsing the file, the number of rows it
contained, and the overall md5 hash.
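A rough sketch of what a staging write could look like through happybase
(which setup.py depends on); the column families and qualifiers here are
invented for illustration and are not taken from the actual scripts:

    # Sketch only: write one INSERT chunk and one dump-file entry to the two
    # staging tables. Column families/qualifiers are assumptions.
    import hashlib
    import happybase

    conn = happybase.Connection("localhost", table_prefix="namespace")
    values_tbl = conn.table("values")    # i.e. "namespace"_values
    creates_tbl = conn.table("creates")  # i.e. "namespace"_creates

    chunk = '(1,"a"),(2,"b")'
    chunk_hash = hashlib.md5(chunk.encode("utf-8")).hexdigest()

    values_tbl.put(chunk_hash, {"sql:chunk": chunk})
    creates_tbl.put("2013-01-20", {
        "meta:hashes_mytable": chunk_hash,
        "meta:create_mytable": "CREATE TABLE mytable (...)",
    })
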
A publishing level -> (bin/sqlhbase-populate)
---------------------------------------------
Given a namespace (as of the initial import) and a timestamp (from a list):
- the content of the table CREATE statement gets interpreted, the data
  types are mapped from MySQL to Hive, and the table is created on Hive
- if the table does not exist yet, it gets populated fully, reading each
  16MB chunk
- the table is created with the convention "namespace"_"table_name"
- if the table exists and already contains data, we compute the difference
  between the two lists of hashes that were created at ingestion time
  (see the sketches after this list):
  -- then we check what has already been ingested in the range of row IDs
     contained in the MySQL chunk (we assume that MySQL dumps a table
     sequentially, hopefully)
  -- if a row ID that is in the sequence in the database is not in the
     sequence from the chunk we are ingesting, then we might have a DELETE
     (a DELETE that we do not execute on HBase due to HBASE-5154 and
     HBASE-5241)
  -- if a row ID is also in our chunk, we check each column for changes
  -- duplicated columns are removed from the list that is going to be sent
     to the server, to avoid wasting bandwidth
- at this stage, we fetch a copy of the data at the next known ingestion
  date (dates are known from the list of dumps in the meta table)
  -- if data is found, each row gets diffed against the data to be ingested
     that is left over from the previous cleaning... if there are real
     changes, those are kept and sent to the HBase server for writing
     (timestamps are verified at this stage, to avoid resending data
     that has already been written before)

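The hash-list comparison above boils down to set operations; a minimal sketch,
with names chosen for illustration rather than taken from the code:

    # Sketch only: compare the hashes already published for a table with the
    # hashes computed from the dump currently being published.
    def diff_hash_lists(stored_hashes, incoming_hashes):
        stored, incoming = set(stored_hashes), set(incoming_hashes)
        return {
            "new": incoming - stored,           # chunks still to publish
            "already_seen": incoming & stored,  # chunks that can be skipped
            "missing": stored - incoming,       # possible DELETEs (not applied)
        }

    print(diff_hash_lists(["h1", "h2"], ["h2", "h3"]))

And for the MySQL-to-Hive step, the kind of type mapping and Hive-over-HBase
DDL the publishing step could emit; the mapping table, the "d" column family
and the helper are assumptions, not the tool's actual code:

    # Sketch only: map MySQL column types to Hive types and build a Hive table
    # backed by the HBase table "namespace"_"table_name".
    MYSQL_TO_HIVE = {"int": "INT", "bigint": "BIGINT", "tinyint": "TINYINT",
                     "varchar": "STRING", "text": "STRING", "datetime": "STRING",
                     "decimal": "DOUBLE", "float": "FLOAT"}

    def hive_ddl(namespace, table, columns):
        # columns: (name, mysql_type) pairs parsed out of the CREATE statement
        cols = ", ".join("%s %s" % (n, MYSQL_TO_HIVE.get(t, "STRING"))
                         for n, t in columns)
        mapping = ":key," + ",".join("d:%s" % n for n, _ in columns[1:])
        return ("CREATE EXTERNAL TABLE %s_%s (%s) "
                "STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' "
                "WITH SERDEPROPERTIES ('hbase.columns.mapping' = '%s') "
                "TBLPROPERTIES ('hbase.table.name' = '%s_%s')"
                % (namespace, table, cols, mapping, namespace, table))

    print(hive_ddl("namespace", "users", [("id", "int"), ("name", "varchar")]))
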
FIXME: ingesting data while skipping a day will need a proper
recalculation of the difference of the hash lists...
Ingesting data from a backup that was not previously ingested
(while we kept ingesting data into the tables) will leave some
redundant, duplicated data in HBase, simply because we do not dare
to delete the duplicates that are "in the future".

...anyway, it is pretty easy to delete a table and reconstruct it,
since all the history is kept in the staging level of HBase.

Last but not least, we do parse VIEWs and apply them on Hive
... be careful about https://issues.apache.org/jira/browse/HIVE-2055 !!!

bin/sqlhbase-mysqlimport

+65
@@ -0,0 +1,65 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import getopt
import sys
from datetime import datetime
from sqlhbase.mysql import MySQLDump

def usage():
    print """
    USAGE:
    $ tar xOf 01-20-00_all_databases.tar bob_live_hk.sql.gz | zcat | ...
     or
    $ unzip -p production_sg_DAY_20130121.zip | ...

    then ...
    $ echo $SQL|ri-fastdump2hbase -d INDFAS

    ADDONs:
    "skip tables option" -s, --skiptable=
        -s table1,table2,table3
        # tables to absolutely avoid to ingest, like "alice_message"

    "forced_timestamp" -t, --timestamp=
        -t 2013-01-20
        # in case we have no timestamp at the bottom of the dump
    """

if len(sys.argv) < 3:
    usage()
    sys.exit(2)

try:
    opts, args = getopt.getopt(sys.argv[1:], "d:f:s:t:", ["db=", "sqlfile=", "skiptable=", "timestamp="])
except getopt.GetoptError as err:
    # print help information and exit:
    print str(err)  # will print something like "option -a not recognized"
    usage()
    sys.exit(2)

input_db = ""
sql_file = None
skip_tables = []
forced_timestamp = ""
for opt, arg in opts:
    if opt in ('-d', '--db'):
        input_db = arg
    elif opt in ('-f', '--sqlfile'):
        sql_file = arg
    elif opt in ('-s', '--skiptable'):
        skip_tables = arg.split(",")
    elif opt in ('-t', '--timestamp'):
        forced_timestamp = arg

print 'DB>', input_db

# get the argument file (uncompressed MySQL dump)
if sql_file is not None:
    f = open(sql_file)
    # reuse the already opened handle instead of opening the file a second time
    print MySQLDump(f, input_db, skip_tables, forced_timestamp)
    f.close()
    sys.exit(0)

# ... or read what they pipe me into stdin
print MySQLDump(sys.stdin, input_db, skip_tables, forced_timestamp)

bin/sqlhbase-populate

+64
@@ -0,0 +1,64 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'zeph'
import getopt
import sys
from sqlhbase.intake import HBaseIntake

def usage():
    print """
    USAGE:
    $ ri-parseintake -d INDFAS -t 1234567890

    ADDONS: -i file_with_list_of_tables_to_INCLUDE
            -e file_with_list_of_tables_to_EXCLUDE
            -l just list the tables available
    """

if len(sys.argv) < 3:
    usage()

try:
    opts, args = getopt.getopt(sys.argv[1:],
        "d:t:i:e:l", ["db=", "timestamp=", "include=", "exclude=", "listing"])  # --listing takes no argument, matching -l
except getopt.GetoptError as err:
    # print help information and exit:
    print str(err)  # will print something like "option -a not recognized"
    usage()
    sys.exit(2)

input_db = ""
sql_file = None
timestamp = ""
include_list = ""
exclude_list = ""
listing = False
for opt, arg in opts:
    if opt in ('-d', '--db'):
        input_db = arg
    elif opt in ('-t', '--timestamp'):
        timestamp = arg
    elif opt in ('-i', '--include'):
        include_list = arg
    elif opt in ('-e', '--exclude'):
        exclude_list = arg
    elif opt in ('-l', '--listing'):
        listing = True

hbase = HBaseIntake(input_db)
try:
    hbase.connect()
except Exception:
    # on failure, list the known namespaces and bail out
    print hbase.get_namespaces()
    sys.exit(2)

print 'DB>', input_db
if timestamp == "":
    ava = hbase.get_dumps()
    for day in ava:
        print day
    print "DAY(DUMP)s AVAILABLE>", len(ava)
    #print >> sys.stderr, hbase.prettify()
    sys.exit(2)
elif listing:
    print "\n".join(hbase.cls_parser().get_tables(timestamp))
else:
    print hbase.parse(timestamp, exclude_list, include_list)

pytest.ini

+6
@@ -0,0 +1,6 @@
# content of pytest.ini
[pytest]
addopts = --doctest-modules


# src: http://pytest.org/latest/doctest.html

setup.py

+22
@@ -0,0 +1,22 @@
__author__ = 'zeph'

from setuptools import setup  # setuptools' setup supersedes the distutils.core one

setup(
    name='SqlHBase',
    version='0.4',
    author='Guido Serra aka Zeph',
    author_email='[email protected]',
    url='https://github.com/zeph/sqlhbase',
    packages=['sqlhbase'],
    scripts=['bin/sqlhbase-mysqlimport', 'bin/sqlhbase-populate'],
    license='GPL 3',
    description='MySQLDump to HBase, ETL scripts',
    long_description=open('README.md').read(),
    install_requires=[
        "happybase >= 0.4",
        "hive-thrift-py == 0.0.1",
    ],
    include_package_data=True,
)

sqlhbase/__init__.py

Whitespace-only changes.

sqlhbase/borgs/__init__.py

+12
@@ -0,0 +1,12 @@
__author__ = 'zeph'
CLUSTER_HOST = 'localhost'

import os
if os.environ.get('CLUSTER_HOST') is not None:
    CLUSTER_HOST = os.environ.get('CLUSTER_HOST')

# http://code.activestate.com/recipes/66531/
class Borg:
    __shared_state = {}
    def __init__(self):
        self.__dict__ = self.__shared_state

sqlhbase/borgs/hbase.py

+20
@@ -0,0 +1,20 @@
__author__ = 'zeph'

from happybase import Connection
from sqlhbase.borgs import *

class HBase(Borg):

    _db = {}  # all the connections

    def link(self, ns):
        if ns not in self._db:
            try:
                self._db[ns] = Connection(CLUSTER_HOST, table_prefix=ns)
            except Exception, e:
                print e
                print 'export CLUSTER_HOST="yourserver-hostname", please'
        return self._db[ns]

    def __str__(self):
        return str(id(self._db))  # __str__ must return a string, not an int

sqlhbase/borgs/hive.py

+37
@@ -0,0 +1,37 @@
__author__ = 'zeph'
THRIFT_PORT = 10000

from hive_service import ThriftHive
from hive_service.ttypes import HiveServerException
from thrift import Thrift
from thrift.transport import TSocket
from thrift.transport import TTransport
from thrift.protocol import TBinaryProtocol
from sqlhbase.borgs import *

class Hive(Borg):

    _db = None  # DB connection

    def link(self):
        if self._db is None:
            transport = TSocket.TSocket(CLUSTER_HOST, THRIFT_PORT)
            transport = TTransport.TBufferedTransport(transport)
            protocol = TBinaryProtocol.TBinaryProtocol(transport)
            self._db = ThriftHive.Client(protocol)
            transport.open()
        return self._db

    def __str__(self):
        return str(id(self._db))  # __str__ must return a string, not an int

    def exec_stmt(self, sql):
        try:
            self._db.execute(sql)
            return self._db.fetchAll()

        except Thrift.TException, tx:
            print '%s' % (tx.message)

        except HiveServerException:
            print "HiveServerException>", sql.strip()
