[Erp5-report] r37681 nicolas - in /erp5/trunk/utils/erp5.utils.sql_dump_splitter: ./ src/ s...

nobody at svn.erp5.org nobody at svn.erp5.org
Wed Aug 11 10:02:25 CEST 2010


Author: nicolas
Date: Wed Aug 11 10:02:22 2010
New Revision: 37681

URL: http://svn.erp5.org?rev=37681&view=rev
Log:
Utility useful to split a large MySQL dump into
multiple files.
beta version, still in development.

Added:
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/
    erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py

Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,3 @@
+2010/08/11 Nicolas Delaby
+-------------------------
+Initial version
\ No newline at end of file

Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,27 @@
+sql_dump_splitter:
+Split SQL dump produced by mysqldump with following options:
+  --single-transaction --no-autocommit --opt
+options are -w for working directory, where created folders and split files will
+be located
+            -m for the maximum number of insertions per file
+
+For each table found in the dump file, this script will produce at least:
+- One table structure file.
+- As many files as needed, each containing at most (-m parameter value) insert lines.
+
+as example for table `foo`:
+  ./foo/foo_table_structure.sql
+       /foo_0000.sql
+       /foo_0001.sql
+       /...
+
+sql_dump_importer:
+Utility to import into mysql the directory structure produced by sql_dump_splitter.
+options are -H host
+            -u user
+            -P port
+            -p password
+            -b mysql binary path
+mandatory parameters are: database_id and working_directory
+
+

Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
"""Packaging script for erp5.utils.sql_dump_splitter."""
from setuptools import setup, find_packages

version = '0.0.1b'
name = 'erp5.utils.sql_dump_splitter'

def read(name):
    """Return the whole content of file *name*, closing the handle.

    The original `open(name).read()` leaked the file object.
    """
    with open(name) as file_object:
        return file_object.read()

# Long description shown on PyPI: README followed by the changelog.
long_description = (
        read('README.txt')
        + '\n' +
        read('CHANGES.txt')
    )

setup(name=name,
      version=version,
      description="Utility to split large dumps of SQL database into multiple"
      " files. Useful when dumps are too large to be imported in single"
      " transaction.",
      long_description=long_description,
      classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: System Administrators',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Topic :: Database :: Database Engines/Servers',
        'Topic :: System :: Archiving',
        'Topic :: Utilities',
        ],
      keywords='SQL mysqldump splitter',
      author='Nicolas Delaby',
      author_email='nicolas at nexedi.com',
      url='http://www.erp5.org/',
      package_dir={'': 'src'},
      packages=find_packages('src'),
      namespace_packages=['erp5', 'erp5.utils'],
      include_package_data=True,
      # Both console scripts live in the same module.
      entry_points={
        'console_scripts': ['sql_dump_splitter = erp5.utils.sql_dump_splitter:sql_dump_splitter',
                            'sql_dump_importer = erp5.utils.sql_dump_splitter:sql_dump_importer']
        }
      )

Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,7 @@
# Standard setuptools namespace-package boilerplate, so that several
# distributions can install sub-packages under the shared 'erp5' package.
# See http://peak.telecommunity.com/DevCenter/setuptools#namespace-packages
try:
   # Prefer the setuptools implementation when pkg_resources is available.
   __import__('pkg_resources').declare_namespace(__name__)
except ImportError:
  # setuptools is not installed: fall back to the stdlib pkgutil mechanism.
  from pkgutil import extend_path
  __path__ = extend_path(__path__, __name__) 


Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,7 @@
# Standard setuptools namespace-package boilerplate, so that several
# distributions can install sub-packages under the shared 'erp5.utils' package.
# See http://peak.telecommunity.com/DevCenter/setuptools#namespace-packages
try:
   # Prefer the setuptools implementation when pkg_resources is available.
   __import__('pkg_resources').declare_namespace(__name__)
except ImportError:
  # setuptools is not installed: fall back to the stdlib pkgutil mechanism.
  from pkgutil import extend_path
  __path__ = extend_path(__path__, __name__) 


Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,353 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
+#                    Nicolas Delaby <nicolas at nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs.
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly advised to contract a Free Software
+# Service Company.
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#
+##############################################################################
import os
import shutil
import sys
import re
import tempfile
from datetime import datetime
from glob import glob
from subprocess import Popen, PIPE, STDOUT
import shlex

# Unique sentinel object.  NOTE(review): currently unused in this module.
_MARKER = []

from optparse import OptionParser

# Markers used to recognise the sections of a dump produced by
# `mysqldump --single-transaction --no-autocommit --opt`.
mysql_dump_start_flag = '-- MySQL dump'
# Conditional-comment prefix of the global variable settings found at the
# top of the dump, e.g. /*!40101 SET ... */;
global_variables_flag = '/*!40'
# Captures the table name from the "-- Table structure for table `x`"
# banner that precedes each CREATE TABLE statement.
table_structure_start_regex = re.compile('-- Table structure for table'\
                                         ' `(?P<table_name>\S+)`')
drop_table_command_flag = 'DROP TABLE IF EXISTS `%s`;'
dumping_data_start_flag = '-- Dumping data for table `%s`'
insert_command_flag = 'INSERT INTO `%s` VALUES ('
# Separator between two value tuples of a multi-row INSERT statement.
insert_separator = '),('
commit_flag = 'commit;'
# Matches the end of a multi-row INSERT: "...);" followed by a "/*!".
end_of_insertion_regex = re.compile('(.*\));\s?(/\*!)')

# Suffix of the per-table file that holds DROP/CREATE TABLE statements.
table_structure_prefix = '_table_structure.sql'
+
class TablePoolFileHandler(object):
  """Handle all the on-disk files associated with one table of the dump.

  For a table `foo` the handler owns <working_directory>/foo/ and
  creates inside it:
    - foo_table_structure.sql: the DROP TABLE / CREATE TABLE statements
    - foo_0000.sql, foo_0001.sql, ...: data files holding at most
      `max_allowed_insert` value tuples each
  """
  def __init__(self, table_name, max_allowed_insert, working_directory,
               header_data_list, insert_separator):
    """
    table_name -- name of the handled table
    max_allowed_insert -- maximum number of value tuples per data file
    working_directory -- parent directory; a <table_name> sub-directory
                         is (re)created inside it
    header_data_list -- dump header lines copied at the top of every
                        generated file
    insert_separator -- separator between two value tuples, i.e. "),("
    """
    self._table_name = table_name
    self._max_allowed_insert = max_allowed_insert
    self._working_directory = working_directory.rstrip(os.path.sep)\
                                                     + os.path.sep + table_name
    # Copy the list: the caller keeps mutating its own version.
    self._header_data_list = header_data_list[:]
    # Number of value tuples which may still go into the current file.
    self.countdown = max_allowed_insert
    self.current_file_path = None
    self.drop_and_create_table_data_list = []
    self.before_insert_data_list = []
    self.after_insert_data_list = []
    self.created_file_path_list = []
    self.insert_separator = insert_separator
    # Always start from an empty per-table directory.
    if os.path.isdir(self._working_directory):
      shutil.rmtree(self._working_directory)
    os.mkdir(self._working_directory)

  def _serialiseCreateTableFile(self, footer_data_list):
    """Write <table>_table_structure.sql: dump header, then the
    DROP TABLE / CREATE TABLE statements, then the dump footer.
    """
    destination_file_path = '%s%s%s%s' % (self._working_directory,
                                          os.path.sep, self._table_name,
                                          table_structure_prefix,)
    self._warn('creating %s' % destination_file_path)
    with open(destination_file_path, 'w') as file_object:
      for line in self._header_data_list +\
                  self.drop_and_create_table_data_list +\
                  footer_data_list:
        file_object.write(line + '\n')

  def appendTableStructureData(self, data):
    """Record one line of the DROP TABLE / CREATE TABLE section."""
    self.drop_and_create_table_data_list.append(data)

  def appendBeforeInsertDataList(self, data):
    """Record one line preceding the INSERT values, e.g.:
    LOCK TABLES `table_name` WRITE;
    /*!40000 ALTER TABLE `table_name` DISABLE KEYS */;
    set autocommit=0;
    INSERT INTO `table_name` VALUES 
    """
    self.before_insert_data_list.append(data)

  def _closeInsertionFile(self):
    """Terminate the current data file with ');' and forget it."""
    self._warn('close file: %r' % self.current_file_path)
    with open(self.current_file_path, 'a') as file_object:
      file_object.write(');\n')
    self.current_file_path = None

  def appendInsertionData(self, data):
    """Receive one value tuple such as "),('aa', 'b', 'c'" and append it
    to the current data file, rolling over to a new file every
    `max_allowed_insert` tuples.
    """
    if not data:
      return
    if not self.countdown:
      # The current file is full: terminate it and restart the countdown.
      self._closeInsertionFile()
      self.countdown = self._max_allowed_insert

    if self.current_file_path is None:
      # Open a brand new data file seeded with header + INSERT prologue.
      index = len(self.created_file_path_list)
      destination_file_path = '%s%s%s_%04d.sql' % (self._working_directory,
                                                   os.path.sep,
                                                   self._table_name, index)
      self._warn('new file %r' % destination_file_path)
      with open(destination_file_path, 'w') as file_object:
        file_object.write('\n'.join(self._header_data_list +
                                    self.before_insert_data_list))
      self.current_file_path = destination_file_path
      self.created_file_path_list.append(destination_file_path)
      # The first tuple of a file must not carry the "),(" separator.
      # Bug fix: strip it as an exact prefix -- the original used
      # str.lstrip, which removes a whole *character set* {')', ',', '('}
      # and could also eat leading characters belonging to the data.
      if data.startswith(self.insert_separator):
        data = data[len(self.insert_separator):]
    with open(self.current_file_path, 'a') as file_object:
      file_object.write(data)
    self.countdown -= 1

  def appendAfterInsertionData(self, data):
    """Record one line following the INSERT values, e.g.:
    /*!40000 ALTER TABLE `table_name` ENABLE KEYS */;
    UNLOCK TABLES;
    commit;
    """
    self.after_insert_data_list.append(data)

  def serialise(self, footer_data_list):
    """Flush everything: close the pending data file, write the table
    structure file and append epilogue + footer to every data file.
    """
    if self.current_file_path is not None:
      # Bug fix: close the last file whenever one is still open.  The
      # original condition also required a non-zero countdown, leaving
      # the last file without its terminating ');' when the tuple count
      # was an exact multiple of max_allowed_insert.
      self._closeInsertionFile()
    self._serialiseCreateTableFile(footer_data_list)
    for path in self.created_file_path_list:
      with open(path, 'a') as file_object:
        file_object.write('\n'.join(self.after_insert_data_list
                                    + footer_data_list))

  def _warn(self, message):
    """Print a progress message prefixed with the table name."""
    print('%s %s' % (self._table_name, message))
+
def fileReader(file_object, chunk_size=2 << 13):
  """Generator yielding successive chunks of at most `chunk_size`
  characters (default 16384) read from `file_object`, until EOF.
  """
  while True:
    data = file_object.read(chunk_size)
    if not data:
      # Bug fix (PEP 479): 'raise StopIteration' inside a generator
      # becomes a RuntimeError on Python >= 3.7; a plain return ends
      # the iteration cleanly on every version.
      return
    yield data
+
def sql_dump_splitter():
  """Entry point of the sql_dump_splitter console script.

  Split a dump produced by
    mysqldump --single-transaction --no-autocommit --opt
  (single positional argument) into one sub-directory per table inside
  the -w working directory; each table gets a structure file plus data
  files of at most -m INSERT value tuples (see TablePoolFileHandler).

  The dump is read in fixed-size chunks and parsed line by line with a
  small state machine driven by the flags initialised below.
  """
  usage = "usage: %prog dump_sql"
  parser = OptionParser(usage=usage)
  parser.add_option('-m', '--max_allowed_inserts_by_commit',
                    dest='max_allowed_insert', type="long", default=10000L)
  parser.add_option('-w', '--working_directory',
                    dest='working_directory', default='/tmp')
  (options, args) = parser.parse_args()

  if len(args) != 1 :
    # NOTE(review): print_help() prints by itself and returns None, so
    # this statement additionally prints "None".
    print parser.print_help()
    parser.error('incorrect number of arguments')
  dump_sql_path = args[0]

  dump_file_object = open(dump_sql_path, 'r')
  max_allowed_insert = options.max_allowed_insert
  working_directory = options.working_directory

  # --- state machine flags --------------------------------------------
  read_valid_sql_dump = False  # True once the mysqldump banner was seen
  reading_global_variable = True  # still collecting dump header lines
  footer_list = []  # lines found after the last table (dump footer)
  handler_list = []  # one TablePoolFileHandler per table found
  global_variable_data_list = [] # to append in each file
  current_table_name = ''  # '' while no table section is being read
  dumping_started = False  # inside a "Dumping data for table" section
  pre_command_dumping_started = False  # NOTE(review): assigned, never read
  post_command_dumping_started = False  # past the INSERT terminator
  insert_started = False  # currently inside a multi-row INSERT
  count_down = max_allowed_insert  # NOTE(review): assigned, never read
  last_not_insert = ''  # trailing, possibly truncated, piece of a line
  between_two_table = False  # between a table's commit and the next table
  for chunk in fileReader(dump_file_object):
    #print 'len chunk%r' % len(chunk)
    # Only the very first chunk can start with the banner; afterwards
    # read_valid_sql_dump simply remains True.
    if chunk[:len(mysql_dump_start_flag)] == mysql_dump_start_flag:
      read_valid_sql_dump = True
    if not read_valid_sql_dump:
      raise ValueError('SQL Dump not recognized')
    # NOTE(review): a line spanning two chunks is processed here as two
    # separate lines; last_not_insert compensates for INSERT data only,
    # not for the other sections -- presumably acceptable for the
    # targeted dumps, to be confirmed.
    for line in chunk.splitlines():
      if reading_global_variable:
        # Accumulate header lines replayed at the top of every file.
        global_variable_data_list.append(line)
      if not current_table_name:
        # If there is no table name, always try to
        # discover a new one
        match_object = table_structure_start_regex.search(line)
        if match_object is not None:
          current_table_name = match_object.group('table_name')
          handler = TablePoolFileHandler(current_table_name,
                                         max_allowed_insert,
                                         working_directory,
                                         global_variable_data_list,
                                         insert_separator)
          handler_list.append(handler)
          print 'Find new table:%r' % current_table_name
          reading_global_variable = False
          between_two_table = False
      if current_table_name:
        if not dumping_started and not post_command_dumping_started:
          # Drop table if exists parts
          handler.appendTableStructureData(line)
        if line == dumping_data_start_flag % current_table_name:
          # Dumping Starting here.
          dumping_started = True
        if not insert_started and not between_two_table:
          insert_command = insert_command_flag % current_table_name
          if line[:len(insert_command)] == insert_command:
            # INSERT INTO `table_name` VALUES 
            insert_started = True
            handler.appendBeforeInsertDataList(insert_command)
            line = line[len(insert_command):] # remove INSERT INTO parts
          if dumping_started and not post_command_dumping_started and\
             not insert_started:
            handler.appendBeforeInsertDataList(line + '\n')
        if insert_started:
          # get all Values (...),(...),(...),
          # Re-glue the piece kept from the previous line/chunk.
          local_line = last_not_insert + line
          unique_insert_list = local_line.split(insert_separator)
          first_insert = unique_insert_list.pop(0)
          if unique_insert_list:
            # it means first is not unique and not truncated
            handler.appendInsertionData(insert_separator + first_insert)
            last_not_insert = unique_insert_list.pop() # remove last which
                                                       # could be truncated
            # Side-effect-only list comprehension, kept as-is.
            [handler.appendInsertionData(insert_separator + insert)\
                                         for insert in unique_insert_list[:-1]]
            if unique_insert_list:
              # NOTE(review): rstrip(')') removes *all* trailing closing
              # parentheses, so a value genuinely ending in ')' would be
              # truncated -- to be confirmed.
              data = insert_separator + unique_insert_list[-1].rstrip(')')
              handler.appendInsertionData(data)
          else:
            last_not_insert = first_insert
          if ');' in last_not_insert:
            # Condition just to minimize usage of regex
            # which is a very inefficient regex
            match_object = end_of_insertion_regex.search(last_not_insert)
            if match_object is not None:
              handler.appendInsertionData(insert_separator +\
                                          match_object.group(1).rstrip(')'))
              last_not_insert = ''
              post_command_dumping_started = True
              insert_started = False
              dumping_started = False
        if post_command_dumping_started:
          handler.appendAfterInsertionData(line + '\n')
        if line == commit_flag:
          # Useful to detect the end of the dump;
          # a new table can follow.
          print 'End of dump for table:%r' % current_table_name
          current_table_name = ''
          dumping_started = False
          insert_started = False
          post_command_dumping_started = False
          between_two_table = True
      if between_two_table and line.strip() and line not in ('--', 'commit;',):
        footer_list.append(line)

  # Flush every handler: close data files and write structure files.
  for handler in handler_list:
    handler.serialise(footer_list)
  print 'Done'
+
+
def sql_dump_importer():
  """Entry point: import into MySQL the directory structure produced by
  sql_dump_splitter.

  Positional arguments: database working_directory
  Options: -b mysql binary, -u user, -p password, -H host, -P port

  The *_table_structure.sql files are imported first, then the data
  files in sorted order; each file is fed on the standard input of a
  dedicated mysql process.

  Raises ValueError as soon as one mysql invocation writes to stderr.
  """
  usage = "usage: %prog [options] database working_directory"
  parser = OptionParser(usage=usage)
  parser.add_option('-b', '--binary',
                    dest='binary', default='mysql')
  parser.add_option('-u', '--user',
                    dest='user', default='root')
  parser.add_option('-p', '--password',
                    dest='password', default='')
  parser.add_option('-H', '--host',
                    dest='host', default='localhost')
  parser.add_option('-P', '--port',
                    dest='port', default='3306')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    # print_help() prints by itself; the original wrapped it in a print
    # statement, which additionally printed "None".
    parser.print_help()
    parser.error('incorrect number of arguments')
  database = args[0]
  working_directory = args[1]

  command_string = '%s -u %s -h %s -P %s %s' % (options.binary,
                                                options.user,
                                                options.host,
                                                options.port,
                                                database)
  if options.password:
    # Bug fix: the original appended '-p %s' with no leading space,
    # gluing the option onto the database name ("...db-p secret"); and
    # mysql does not accept a space between -p and the password anyway,
    # so use the unambiguous long form.
    command_string += ' --password=%s' % options.password

  def runCommandForFileList(command, file_list):
    """Run `command` once per file of `file_list`, file fed on stdin.

    Bug fix: the original ignored its `command` parameter and always
    used the enclosing command_string; it also never closed the stdin
    file object.
    """
    for filename in file_list:
      print('Running:%r' % (command + ' < ' + filename))
      input_file = open(filename, 'r')
      try:
        process = Popen(shlex.split(command), stdin=input_file,
                        stdout=PIPE, stderr=PIPE)
        stdout, stderr = process.communicate()
      finally:
        input_file.close()
      if stderr:
        # NOTE(review): mysql may also emit warnings on stderr; this
        # treats any stderr output as a fatal import error.
        raise ValueError('SQL Import failed:\n%r' % stderr)

  # First pass: create the tables.
  filename_list = glob(working_directory.rstrip(os.sep)
                       + '/*/*%s' % table_structure_prefix)
  runCommandForFileList(command_string, filename_list)
  # Second pass: import the data files, in deterministic (sorted) order.
  filename_dump_list = [filepath for filepath in
                        glob(working_directory.rstrip(os.sep) + '/*/*.sql')
                        if table_structure_prefix not in filepath]
  filename_dump_list.sort()
  runCommandForFileList(command_string, filename_dump_list)




More information about the Erp5-report mailing list