[Erp5-report] r37681 nicolas - in /erp5/trunk/utils/erp5.utils.sql_dump_splitter: ./ src/ s...
nobody at svn.erp5.org
nobody at svn.erp5.org
Wed Aug 11 10:02:25 CEST 2010
Author: nicolas
Date: Wed Aug 11 10:02:22 2010
New Revision: 37681
URL: http://svn.erp5.org?rev=37681&view=rev
Log:
Utility useful to split a large MySQL dump into
multiple files.
beta version, still in development.
Added:
erp5/trunk/utils/erp5.utils.sql_dump_splitter/
erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt
erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt
erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py
erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/
erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/
erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py
erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/
erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py
erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/
erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py
Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/CHANGES.txt [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,3 @@
+2010/08/11 Nicolas Delaby
+-------------------------
+Initial version
\ No newline at end of file
Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/README.txt [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,27 @@
+sql_dump_splitter:
+Split SQL dump produced by mysqldump with following options:
+ --single-transaction --no-autocommit --opt
+options are -w for working directory, where created folders and split files will
+be located
+ -m max insertion per file
+
+This script will produce, for each table found in the dump file, at least:
+- One table structure file.
+- As many files as needed, each containing at most (-m parameter value) insert lines.
+
+as example for table `foo`:
+    ./foo/foo_table_structure.sql
+ /foo_0000.sql
+ /foo_0001.sql
+ /...
+
+sql_dump_importer:
+Utility to import into MySQL the directory structure produced by sql_dump_splitter.
+options are -H host
+ -u user
+ -P port
+ -p password
+ -b mysql binary path
+mandatory parameters are: database_id and working_directory
+
+
Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/setup.py [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+from setuptools import setup, find_packages
+import sys, os
+
+# Distribution version ('b' suffix marks a beta) and full egg name.
+version = '0.0.1b'
+name = 'erp5.utils.sql_dump_splitter'
+
def read(name):
    """Return the whole content of the file at path *name*.

    The file object is closed explicitly instead of relying on garbage
    collection to release the descriptor (the original leaked the
    handle until the interpreter collected it).
    """
    file_object = open(name)
    try:
        return file_object.read()
    finally:
        file_object.close()
+
+# Long description published on PyPI: README followed by the changelog.
+long_description=(
+    read('README.txt')
+    + '\n' +
+    read('CHANGES.txt')
+    )
+
+setup(name=name,
+      version=version,
+      description="Utility to split large dumps of SQL database into multiple"\
+                  " files. Usefull when dumps are too large to be imported in single"\
+                  " transaction.",
+      long_description=long_description,
+      classifiers=[
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: System Administrators',
+        'License :: OSI Approved :: GNU General Public License (GPL)',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python',
+        'Topic :: Database :: Database Engines/Servers',
+        'Topic :: System :: Archiving',
+        'Topic :: Utilities',
+        ],
+      keywords='SQL mysqldump splitter',
+      author='Nicolas Delaby',
+      author_email='nicolas at nexedi.com',
+      url='http://www.erp5.org/',
+      # Code lives under src/; 'erp5' and 'erp5.utils' are namespace
+      # packages shared with other ERP5 distributions.
+      package_dir={'':'src'},
+      packages=find_packages('src'),
+      namespace_packages=['erp5', 'erp5.utils'],
+      include_package_data=True,
+      # Console scripts exposing the splitter and the importer.
+      entry_points = {
+          'console_scripts' : ['sql_dump_splitter = erp5.utils.sql_dump_splitter:sql_dump_splitter',
+                               'sql_dump_importer = erp5.utils.sql_dump_splitter:sql_dump_importer']
+      }
+      )
Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/__init__.py [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,7 @@
+# See http://peak.telecommunity.com/DevCenter/setuptools#namespace-packages
+# Declare 'erp5' as a namespace package so several distributions can
+# install sub-packages under the same 'erp5' prefix.
+try:
+    __import__('pkg_resources').declare_namespace(__name__)
+except ImportError:
+    # setuptools is unavailable: fall back to the stdlib mechanism,
+    # which extends __path__ with every matching directory on sys.path.
+    from pkgutil import extend_path
+    __path__ = extend_path(__path__, __name__)
+
Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/__init__.py [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,7 @@
+# See http://peak.telecommunity.com/DevCenter/setuptools#namespace-packages
+# Declare 'erp5.utils' as a namespace package so several distributions
+# can install sub-packages under the same 'erp5.utils' prefix.
+try:
+    __import__('pkg_resources').declare_namespace(__name__)
+except ImportError:
+    # setuptools is unavailable: fall back to the stdlib mechanism,
+    # which extends __path__ with every matching directory on sys.path.
+    from pkgutil import extend_path
+    __path__ = extend_path(__path__, __name__)
+
Added: erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py
URL: http://svn.erp5.org/erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py?rev=37681&view=auto
==============================================================================
--- erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py (added)
+++ erp5/trunk/utils/erp5.utils.sql_dump_splitter/src/erp5/utils/sql_dump_splitter/__init__.py [utf8] Wed Aug 11 10:02:22 2010
@@ -0,0 +1,353 @@
+# -*- coding: utf-8 -*-
+##############################################################################
+#
+# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
+# Nicolas Delaby <nicolas at nexedi.com>
+#
+# WARNING: This program as such is intended to be used by professional
+# programmers who take the whole responsibility of assessing all potential
+# consequences resulting from its eventual inadequacies and bugs
+# End users who are looking for a ready-to-use solution with commercial
+# guarantees and support are strongly advised to contract a Free Software
+# Service Company
+#
+# This program is Free Software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+##############################################################################
+import os
+import shutil
+import sys
+import re
+import tempfile
+from datetime import datetime
+from glob import glob
+from subprocess import Popen, PIPE, STDOUT
+import shlex
+
_MARKER = []  # NOTE(review): unused module-level sentinel, kept for compatibility.

from optparse import OptionParser

# Sentinels used while scanning the output of mysqldump.
mysql_dump_start_flag = '-- MySQL dump'
global_variables_flag = '/*!40'
table_structure_start_regex = re.compile(r'-- Table structure for table'
                                         r' `(?P<table_name>\S+)`')
drop_table_command_flag = 'DROP TABLE IF EXISTS `%s`;'
dumping_data_start_flag = '-- Dumping data for table `%s`'
insert_command_flag = 'INSERT INTO `%s` VALUES ('
insert_separator = '),('
commit_flag = 'commit;'
end_of_insertion_regex = re.compile(r'(.*\));\s?(/\*!)')

# Suffix of the per-table file holding the DROP/CREATE TABLE statements.
table_structure_prefix = '_table_structure.sql'


class TablePoolFileHandler(object):
    """Handle all the files associated to a single table.

    The handler owns one sub-directory of the working directory (named
    after the table) and produces:
      - <table>_table_structure.sql holding DROP/CREATE statements,
      - <table>_NNNN.sql chunk files, each carrying one INSERT statement
        with at most max_allowed_insert value tuples.
    """

    def __init__(self, table_name, max_allowed_insert, working_directory,
                 header_data_list, insert_separator):
        self._table_name = table_name
        self._max_allowed_insert = max_allowed_insert
        # Per-table sub-directory, recreated from scratch on every run.
        self._working_directory = working_directory.rstrip(os.path.sep)\
                                  + os.path.sep + table_name
        # Copy: the caller keeps mutating its own header list while parsing.
        self._header_data_list = header_data_list[:]
        # Number of value tuples the current chunk file can still accept.
        self.countdown = max_allowed_insert
        self.current_file_path = None
        self.drop_and_create_table_data_list = []
        self.before_insert_data_list = []
        self.after_insert_data_list = []
        self.created_file_path_list = []
        self.insert_separator = insert_separator
        if os.path.isdir(self._working_directory):
            shutil.rmtree(self._working_directory)
        os.mkdir(self._working_directory)

    def _serialiseCreateTableFile(self, footer_data_list):
        """Write the table structure file:
        DROP TABLE IF EXISTS `table_name`;
        ...
        CREATE TABLE ...
        framed by the shared dump header and *footer_data_list*.
        """
        destination_file_path = '%s%s%s%s' % (self._working_directory,
                                              os.path.sep, self._table_name,
                                              table_structure_prefix,)
        self._warn('creating %s' % destination_file_path)
        file_object = open(destination_file_path, 'w')
        for line in (self._header_data_list +
                     self.drop_and_create_table_data_list +
                     footer_data_list):
            file_object.write(line + '\n')
        file_object.close()

    def appendTableStructureData(self, data):
        """Record one line of the DROP/CREATE TABLE section."""
        self.drop_and_create_table_data_list.append(data)

    def appendBeforeInsertDataList(self, data):
        """Record one line replayed before the VALUES of every chunk file:
        LOCK TABLES `table_name` WRITE;
        /*!40000 ALTER TABLE `table_name` DISABLE KEYS */;
        set autocommit=0;
        INSERT INTO `table_name` VALUES (
        """
        self.before_insert_data_list.append(data)

    def _closeInsertionFile(self):
        # Terminate the pending INSERT statement of the current chunk file.
        self._warn('close file: %r' % self.current_file_path)
        file_object = open(self.current_file_path, 'a')
        file_object.write(');\n')
        file_object.close()
        self.current_file_path = None

    def appendInsertionData(self, data):
        """Receive one value-tuple string such as "'aa', 'b', 'c'",
        possibly prefixed by the "),(" separator.

        Append it to the current chunk file, closing the file and
        opening a new one whenever max_allowed_insert tuples have been
        written.
        """
        if not data:
            return
        if not self.countdown:
            # The current file is full: terminate it ...
            self._closeInsertionFile()
            # ... and restart the countdown for the next one.
            self.countdown = self._max_allowed_insert

        if self.current_file_path is None:
            # Open a new sequentially numbered chunk file.
            i = len(self.created_file_path_list)
            destination_file_path = '%s%s%s_%04d.sql' % (self._working_directory,
                                                         os.path.sep,
                                                         self._table_name, i)
            self._warn('new file %r' % destination_file_path)
            file_object = open(destination_file_path, 'w')
            file_object.write('\n'.join(self._header_data_list +
                                        self.before_insert_data_list))
            file_object.close()
            self.current_file_path = destination_file_path
            self.created_file_path_list.append(destination_file_path)
            # It is a new file, so drop the leading "),(" separator.
            # BUGFIX: str.lstrip() strips any run of the characters
            # ')', ',', '(' and could eat the beginning of the first
            # value tuple; strip the exact prefix instead.
            if data.startswith(self.insert_separator):
                data = data[len(self.insert_separator):]
        file_object = open(self.current_file_path, 'a')
        file_object.write(data)
        file_object.close()
        self.countdown -= 1

    def appendAfterInsertionData(self, data):
        """Record one line replayed after the VALUES of every chunk file:
        /*!40000 ALTER TABLE `table_name` ENABLE KEYS */;
        UNLOCK TABLES;
        commit;
        """
        self.after_insert_data_list.append(data)

    def serialise(self, footer_data_list):
        """Create all files in the working directory: terminate the last
        chunk file, write the table structure file, then append the
        post-INSERT commands and *footer_data_list* to every chunk file.
        """
        # BUGFIX: the last file must be terminated whenever it is still
        # open.  The previous guard ('if self.countdown and ...') left
        # the file without its closing ');' when the number of tuples
        # was an exact multiple of max_allowed_insert (countdown == 0).
        if self.current_file_path is not None:
            self._closeInsertionFile()
        self._serialiseCreateTableFile(footer_data_list)
        for path in self.created_file_path_list:
            file_object = open(path, 'a')
            file_object.write('\n'.join(self.after_insert_data_list + footer_data_list))
            file_object.close()

    def _warn(self, message):
        """Trace helper: prefix *message* with the table name on stdout."""
        # Equivalent to the Python 2 statement
        # ``print self._table_name, message`` but also valid on Python 3.
        sys.stdout.write('%s %s\n' % (self._table_name, message))
+
def fileReader(file_object):
    """Generator yielding successive chunks of 16384 bytes/characters
    read from *file_object*, stopping at end of file.

    BUGFIX: end the generator with a plain ``return`` — raising
    StopIteration explicitly inside a generator is deprecated (PEP 479)
    and becomes a RuntimeError on Python >= 3.7.
    """
    while True:
        data = file_object.read(2 << 13)  # 16384-byte chunks
        if not data:
            return
        yield data
+
+def sql_dump_splitter():
+  """Split a mysqldump file into one directory per table.
+
+  Reads the dump chunk by chunk, drives a line-oriented state machine
+  detecting table sections, and delegates the writing of structure and
+  data files to one TablePoolFileHandler per table.
+  """
+  usage = "usage: %prog dump_sql"
+  parser = OptionParser(usage=usage)
+  parser.add_option('-m', '--max_allowed_inserts_by_commit',
+                    dest='max_allowed_insert', type="long", default=10000L)
+  parser.add_option('-w', '--working_directory',
+                    dest='working_directory', default='/tmp')
+  (options, args) = parser.parse_args()
+
+  if len(args) != 1 :
+    # NOTE(review): print_help() already writes the help text and
+    # returns None, so this statement additionally prints "None".
+    print parser.print_help()
+    parser.error('incorrect number of arguments')
+  dump_sql_path = args[0]
+
+  dump_file_object = open(dump_sql_path, 'r')
+  max_allowed_insert = options.max_allowed_insert
+  working_directory = options.working_directory
+
+  # State machine flags shared across chunks and lines.
+  read_valid_sql_dump = False      # True once the '-- MySQL dump' banner is seen
+  reading_global_variable = True   # True while still reading the dump header
+  footer_list = []                 # trailing lines replayed at the end of every file
+  handler_list = []                # one TablePoolFileHandler per discovered table
+  global_variable_data_list = [] # to append in each file
+  current_table_name = ''
+  dumping_started = False
+  # NOTE(review): pre_command_dumping_started and count_down are
+  # assigned here but never used afterwards (the handler keeps its own
+  # countdown); candidates for removal.
+  pre_command_dumping_started = False
+  post_command_dumping_started = False
+  insert_started = False
+  count_down = max_allowed_insert
+  last_not_insert = ''             # trailing, possibly truncated tuple kept between lines
+  between_two_table = False
+  # NOTE(review): splitlines() is applied per 16K chunk, so a SQL line
+  # spanning a chunk boundary is processed as two separate lines;
+  # the INSERT path tolerates this via last_not_insert, but other
+  # sections may not — TODO confirm on dumps with very long lines.
+  for chunk in fileReader(dump_file_object):
+    #print 'len chunk%r' % len(chunk)
+    if chunk[:len(mysql_dump_start_flag)] == mysql_dump_start_flag:
+      read_valid_sql_dump = True
+    if not read_valid_sql_dump:
+      raise ValueError('SQL Dump not recognized')
+    for line in chunk.splitlines():
+      # for each lines except last one
+      if reading_global_variable:
+        # Dump header (global /*!40...*/ variables) is shared by all files.
+        global_variable_data_list.append(line)
+      if not current_table_name:
+        # If there is no table name, always try to
+        # discover a new one
+        match_object = table_structure_start_regex.search(line)
+        if match_object is not None:
+          current_table_name = match_object.group('table_name')
+          handler = TablePoolFileHandler(current_table_name,
+                                         max_allowed_insert,
+                                         working_directory,
+                                         global_variable_data_list,
+                                         insert_separator)
+          handler_list.append(handler)
+          print 'Find new table:%r' % current_table_name
+          reading_global_variable = False
+          between_two_table = False
+      if current_table_name:
+        if not dumping_started and not post_command_dumping_started:
+          # Drop table if exists parts
+          handler.appendTableStructureData(line)
+          if line == dumping_data_start_flag % current_table_name:
+            # Dumping Starting here.
+            dumping_started = True
+        if not insert_started and not between_two_table:
+          insert_command = insert_command_flag % current_table_name
+          if line[:len(insert_command)] == insert_command:
+            # INSERT INTO `table_name` VALUES
+            insert_started = True
+            handler.appendBeforeInsertDataList(insert_command)
+            line = line[len(insert_command):] # remove INSERT INTO parts
+        if dumping_started and not post_command_dumping_started and\
+           not insert_started:
+          # Pre-INSERT commands (LOCK TABLES, DISABLE KEYS, autocommit).
+          handler.appendBeforeInsertDataList(line + '\n')
+        if insert_started:
+          # get all Values (...),(...),(...),
+          local_line = last_not_insert + line
+          unique_insert_list = local_line.split(insert_separator)
+          first_insert = unique_insert_list.pop(0)
+          if unique_insert_list:
+            # it means first is not unique and not truncated
+            handler.appendInsertionData(insert_separator + first_insert)
+            last_not_insert = unique_insert_list.pop() # remove last which
+                                                       # could be truncated
+            [handler.appendInsertionData(insert_separator + insert)\
+             for insert in unique_insert_list[:-1]]
+            if unique_insert_list:
+              # NOTE(review): rstrip(')') removes every trailing ')',
+              # which could over-strip a value legitimately ending with
+              # parentheses — TODO confirm on such dumps.
+              data = insert_separator + unique_insert_list[-1].rstrip(')')
+              handler.appendInsertionData(data)
+          else:
+            # Single (possibly truncated) tuple: keep accumulating.
+            last_not_insert = first_insert
+          if ');' in last_not_insert:
+            # Condition just to minimize usage of regex
+            # which is a very inefficient regex
+            match_object = end_of_insertion_regex.search(last_not_insert)
+            if match_object is not None:
+              # End of the INSERT statement reached.
+              handler.appendInsertionData(insert_separator +\
+                                          match_object.group(1).rstrip(')'))
+              last_not_insert = ''
+              post_command_dumping_started = True
+              insert_started = False
+              dumping_started = False
+        if post_command_dumping_started:
+          # Post-INSERT commands (ENABLE KEYS, UNLOCK TABLES, commit).
+          handler.appendAfterInsertionData(line + '\n')
+          if line == commit_flag:
+            # Usefull to detect end of dump.
+            # a new table can follow
+            print 'End of dump for table:%r' % current_table_name
+            current_table_name = ''
+            dumping_started = False
+            insert_started = False
+            post_command_dumping_started = False
+            between_two_table = True
+      if between_two_table and line.strip() and line not in ('--', 'commit;',):
+        # Lines after the last table belong to the shared dump footer.
+        footer_list.append(line)
+
+  for handler in handler_list:
+    handler.serialise(footer_list)
+  print 'Done'
+
+
def sql_dump_importer():
    """Import into MySQL the directory structure produced by
    sql_dump_splitter.

    Expects two positional arguments: the target database name and the
    working directory holding one sub-directory per table.  The
    *_table_structure.sql files are imported first (creating the
    tables), then the data chunk files in lexicographic order.
    """
    usage = "usage: %prog [options] database working_directory"
    parser = OptionParser(usage=usage)
    parser.add_option('-b', '--binary',
                      dest='binary', default='mysql')
    parser.add_option('-u', '--user',
                      dest='user', default='root')
    parser.add_option('-p', '--password',
                      dest='password', default='')
    parser.add_option('-H', '--host',
                      dest='host', default='localhost')
    parser.add_option('-P', '--port',
                      dest='port', default='3306')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # print_help() writes the help text itself; the original
        # 'print parser.print_help()' additionally printed "None".
        parser.print_help()
        parser.error('incorrect number of arguments')
    database = args[0]
    working_directory = args[1]

    command_string = '%s -u %s -h %s -P %s %s' % (options.binary,
                                                  options.user,
                                                  options.host,
                                                  options.port,
                                                  database)
    if options.password:
        # BUGFIX: a separator was missing, yielding '...dbname-p xxx'.
        # Use the unambiguous --password=... form (with 'mysql -p xxx'
        # the client would prompt and treat 'xxx' as a database name).
        command_string += ' --password=%s' % options.password

    def runCommandForFileList(command, file_list):
        # Feed each file on the stdin of a fresh mysql client process.
        # (BUGFIX: the helper previously ignored its *command* argument
        # and used the enclosing command_string; same value either way.)
        for filename in file_list:
            sys.stdout.write('Running:%r\n' % (command + ' < ' + filename))
            process = Popen(shlex.split(command), stdin=open(filename, 'r'),
                            stdout=PIPE, stderr=PIPE)
            stdout, stderr = process.communicate()
            if stderr:
                # NOTE(review): mysql also writes warnings to stderr;
                # this aborts on any stderr output, as the original did.
                raise ValueError('SQL Import failed:\n%r' % stderr)

    # First run creates the tables.
    filename_list = glob(working_directory.rstrip(os.sep) + '/*/*%s' % table_structure_prefix)
    runCommandForFileList(command_string, filename_list)
    filename_dump_list = [filepath for filepath in
                          glob(working_directory.rstrip(os.sep) + '/*/*.sql')
                          if table_structure_prefix not in filepath]
    filename_dump_list.sort()
    # Then import the files containing the data.
    runCommandForFileList(command_string, filename_dump_list)
More information about the Erp5-report
mailing list