import gzip import json import logging import os import shutil import tempfile from galaxy.datatypes import checkers from tool_shed.tools import data_table_manager from tool_shed.util import basic_util from tool_shed.util import hg_util from tool_shed.util import shed_util_common as suc import tool_shed.repository_types.util as rt_util log = logging.getLogger( __name__ ) UNDESIRABLE_DIRS = [ '.hg', '.svn', '.git', '.cvs' ] UNDESIRABLE_FILES = [ '.hg_archival.txt', 'hgrc', '.DS_Store', 'tool_test_output.html', 'tool_test_output.json' ] def check_archive( repository, archive ): for member in archive.getmembers(): # Allow regular files and directories only if not ( member.isdir() or member.isfile() or member.islnk() ): message = "Uploaded archives can only include regular directories and files (no symbolic links, devices, etc). " message += "The problematic member in this archive is %s," % str( member.name ) return False, message for item in [ '.hg', '..', '/' ]: if member.name.startswith( item ): message = "Uploaded archives cannot contain .hg directories, absolute filenames starting with '/', or filenames with two dots '..'. " message += "The problematic member in this archive is %s." % str( member.name ) return False, message if member.name in [ 'hgrc' ]: message = "Uploaded archives cannot contain hgrc files. " message += "The problematic member in this archive is %s." % str( member.name ) return False, message if repository.type == rt_util.REPOSITORY_SUITE_DEFINITION and member.name != rt_util.REPOSITORY_DEPENDENCY_DEFINITION_FILENAME: message = 'Repositories of type Repository suite definition can contain only a single file named repository_dependencies.xml.' message += 'This archive contains a member named %s.' % str( member.name ) return False, message if repository.type == rt_util.TOOL_DEPENDENCY_DEFINITION and member.name != rt_util.TOOL_DEPENDENCY_DEFINITION_FILENAME: message = 'Repositories of type Tool dependency definition can contain only a single file named tool_dependencies.xml.' message += 'This archive contains a member named %s.' % str( member.name ) return False, message return True, '' def check_file_contents_for_email_alerts( app ): """ See if any admin users have chosen to receive email alerts when a repository is updated. If so, the file contents of the update must be checked for inappropriate content. """ sa_session = app.model.context.current admin_users = app.config.get( "admin_users", "" ).split( "," ) for repository in sa_session.query( app.model.Repository ) \ .filter( app.model.Repository.table.c.email_alerts != None ): email_alerts = json.loads( repository.email_alerts ) for user_email in email_alerts: if user_email in admin_users: return True return False def check_file_content_for_html_and_images( file_path ): message = '' if checkers.check_html( file_path ): message = 'The file "%s" contains HTML content.\n' % str( file_path ) elif checkers.check_image( file_path ): message = 'The file "%s" contains image content.\n' % str( file_path ) return message def get_change_lines_in_file_for_tag( tag, change_dict ): """ The received change_dict is the jsonified version of the changes to a file in a changeset being pushed to the Tool Shed from the command line. This method cleans and returns appropriate lines for inspection. """ cleaned_lines = [] data_list = change_dict.get( 'data', [] ) for data_dict in data_list: block = data_dict.get( 'block', '' ) lines = block.split( '\\n' ) for line in lines: index = line.find( tag ) if index > -1: line = line[ index: ] cleaned_lines.append( line ) return cleaned_lines def get_upload_point( repository, **kwd ): upload_point = kwd.get( 'upload_point', None ) if upload_point is not None: # The value of upload_point will be something like: database/community_files/000/repo_12/1.bed if os.path.exists( upload_point ): if os.path.isfile( upload_point ): # Get the parent directory upload_point, not_needed = os.path.split( upload_point ) # Now the value of uplaod_point will be something like: database/community_files/000/repo_12/ upload_point = upload_point.split( 'repo_%d' % repository.id )[ 1 ] if upload_point: upload_point = upload_point.lstrip( '/' ) upload_point = upload_point.rstrip( '/' ) # Now the value of uplaod_point will be something like: / if upload_point == '/': upload_point = None else: # Must have been an error selecting something that didn't exist, so default to repository root upload_point = None return upload_point def handle_bz2( repository, uploaded_file_name ): fd, uncompressed = tempfile.mkstemp( prefix='repo_%d_upload_bunzip2_' % repository.id, dir=os.path.dirname( uploaded_file_name ), text=False ) bzipped_file = bz2.BZ2File( uploaded_file_name, 'rb' ) while 1: try: chunk = bzipped_file.read( basic_util.CHUNK_SIZE ) except IOError: os.close( fd ) os.remove( uncompressed ) log.exception( 'Problem uncompressing bz2 data "%s": %s' % ( uploaded_file_name, str( e ) ) ) return if not chunk: break os.write( fd, chunk ) os.close( fd ) bzipped_file.close() shutil.move( uncompressed, uploaded_file_name ) def handle_directory_changes( app, host, username, repository, full_path, filenames_in_archive, remove_repo_files_not_in_tar, new_repo_alert, commit_message, undesirable_dirs_removed, undesirable_files_removed ): repo = hg_util.get_repo_for_repository( app, repository=repository, repo_path=None, create=False ) content_alert_str = '' files_to_remove = [] filenames_in_archive = [ os.path.join( full_path, name ) for name in filenames_in_archive ] if remove_repo_files_not_in_tar and not repository.is_new( app ): # We have a repository that is not new (it contains files), so discover those files that are in the # repository, but not in the uploaded archive. for root, dirs, files in os.walk( full_path ): if root.find( '.hg' ) < 0 and root.find( 'hgrc' ) < 0: for undesirable_dir in UNDESIRABLE_DIRS: if undesirable_dir in dirs: dirs.remove( undesirable_dir ) undesirable_dirs_removed += 1 for undesirable_file in UNDESIRABLE_FILES: if undesirable_file in files: files.remove( undesirable_file ) undesirable_files_removed += 1 for name in files: full_name = os.path.join( root, name ) if full_name not in filenames_in_archive: files_to_remove.append( full_name ) for repo_file in files_to_remove: # Remove files in the repository (relative to the upload point) that are not in # the uploaded archive. try: hg_util.remove_file( repo.ui, repo, repo_file, force=True ) except Exception, e: log.debug( "Error removing files using the mercurial API, so trying a different approach, the error was: %s" % str( e )) relative_selected_file = repo_file.split( 'repo_%d' % repository.id )[1].lstrip( '/' ) repo.dirstate.remove( relative_selected_file ) repo.dirstate.write() absolute_selected_file = os.path.abspath( repo_file ) if os.path.isdir( absolute_selected_file ): try: os.rmdir( absolute_selected_file ) except OSError, e: # The directory is not empty. pass elif os.path.isfile( absolute_selected_file ): os.remove( absolute_selected_file ) dir = os.path.split( absolute_selected_file )[0] try: os.rmdir( dir ) except OSError, e: # The directory is not empty. pass # See if any admin users have chosen to receive email alerts when a repository is updated. # If so, check every uploaded file to ensure content is appropriate. check_contents = check_file_contents_for_email_alerts( app ) for filename_in_archive in filenames_in_archive: # Check file content to ensure it is appropriate. if check_contents and os.path.isfile( filename_in_archive ): content_alert_str += check_file_content_for_html_and_images( filename_in_archive ) hg_util.add_changeset( repo.ui, repo, filename_in_archive ) if filename_in_archive.endswith( 'tool_data_table_conf.xml.sample' ): # Handle the special case where a tool_data_table_conf.xml.sample file is being uploaded # by parsing the file and adding new entries to the in-memory app.tool_data_tables # dictionary. tdtm = data_table_manager.ToolDataTableManager( app ) error, message = tdtm.handle_sample_tool_data_table_conf_file( filename_in_archive, persist=False ) if error: return False, message, files_to_remove, content_alert_str, undesirable_dirs_removed, undesirable_files_removed hg_util.commit_changeset( repo.ui, repo, full_path_to_changeset=full_path, username=username, message=commit_message ) admin_only = len( repository.downloadable_revisions ) != 1 suc.handle_email_alerts( app, host, repository, content_alert_str=content_alert_str, new_repo_alert=new_repo_alert, admin_only=admin_only ) return True, '', files_to_remove, content_alert_str, undesirable_dirs_removed, undesirable_files_removed def handle_gzip( repository, uploaded_file_name ): fd, uncompressed = tempfile.mkstemp( prefix='repo_%d_upload_gunzip_' % repository.id, dir=os.path.dirname( uploaded_file_name ), text=False ) gzipped_file = gzip.GzipFile( uploaded_file_name, 'rb' ) while 1: try: chunk = gzipped_file.read( basic_util.CHUNK_SIZE ) except IOError, e: os.close( fd ) os.remove( uncompressed ) log.exception( 'Problem uncompressing gz data "%s": %s' % ( uploaded_file_name, str( e ) ) ) return if not chunk: break os.write( fd, chunk ) os.close( fd ) gzipped_file.close() shutil.move( uncompressed, uploaded_file_name ) def uncompress( repository, uploaded_file_name, uploaded_file_filename, isgzip=False, isbz2=False ): if isgzip: handle_gzip( repository, uploaded_file_name ) return uploaded_file_filename.rstrip( '.gz' ) if isbz2: handle_bz2( repository, uploaded_file_name ) return uploaded_file_filename.rstrip( '.bz2' )