2014-06-13 06:39:02 +08:00
#!/usr/bin/env python3
from __future__ import print_function #defaults to Python 3, but should also work in 2.7
2014-07-28 16:10:12 +08:00
""" Functions for checking fgdata for various problems (and one for creating smaller/split versions of it)
2014-06-13 06:39:02 +08:00
2014-07-28 16:10:12 +08:00
By Rebecca Palmer """
2014-06-13 06:39:02 +08:00
import os
import os . path
import re
from collections import defaultdict
import subprocess
import math
2014-06-14 05:38:48 +08:00
import tarfile
2014-07-28 16:10:12 +08:00
import gzip
def path_join ( * args ) :
""" Unlike plain os.path.join, this doesn ' t add a trailing / if the last component is empty """
return os . path . normpath ( os . path . join ( * args ) )
2014-06-13 06:39:02 +08:00
def rfilelist ( path , exclude_dirs = [ ] ) :
""" Dict of files/sizes in path, including those in any subdirectories (as relative paths) """
files = defaultdict ( int )
dirs = [ " " ]
while dirs :
cdir = dirs . pop ( )
2014-07-28 16:10:12 +08:00
cdirfiles = os . listdir ( path_join ( path , cdir ) )
2014-06-13 06:39:02 +08:00
for file in cdirfiles :
2014-07-28 16:10:12 +08:00
if os . path . isdir ( path_join ( path , cdir , file ) ) :
if path_join ( cdir , file ) not in exclude_dirs :
dirs . append ( path_join ( cdir , file ) )
2014-06-13 06:39:02 +08:00
else :
2014-07-28 16:10:12 +08:00
files [ path_join ( cdir , file ) ] = os . path . getsize ( path_join ( path , cdir , file ) )
2014-06-13 06:39:02 +08:00
return files
2014-07-28 16:10:12 +08:00
def strip_comments ( text , comment_types = None , filename = None ) :
""" Remove comments from text
Assumes comments don ' t nest (including different types of comments: will be wrong for e.g. /* aaa // bbb */ will-remove-this in C++ if // are removed first)
Doesn ' t check for being inside a string literal, and doesn ' t check for line - start * in C / * . . . * / """
if comment_types is None :
if filename is None :
raise TypeError ( " must give either filename or comment_types " )
if os . path . splitext ( filename ) [ 1 ] in ( " .xml " , " .eff " ) :
comment_types = ( ( " <!-- " , " --> " , " " ) , )
elif os . path . splitext ( filename ) [ 1 ] in ( " .c " , " .cpp " , " .cxx " , " .h " , " .hpp " , " .hxx " , " .frag " , " .vert " ) :
comment_types = ( ( " // " , " \n " , " \n " ) , ( " /* " , " */ " , " " ) )
elif os . path . splitext ( filename ) [ 1 ] in ( " .nas " , ) :
comment_types = ( ( " # " , " \n " , " \n " ) , )
else :
comment_types = [ ]
if type ( text ) in ( bytes , bytearray ) :
comment_types = [ [ bytes ( c , encoding = " ascii " ) for c in ct ] for ct in comment_types ]
for comment_type in comment_types :
text = text . split ( comment_type [ 0 ] , maxsplit = 1 ) [ 0 ] + comment_type [ 2 ] . join ( s . split ( comment_type [ 1 ] , maxsplit = 1 ) [ 1 ] for s in text . split ( comment_type [ 0 ] ) [ 1 : ] if comment_type [ 1 ] in s )
return text
2014-06-14 05:38:48 +08:00
def files_used ( pattern , path , exclude_dirs = [ ] , filelist = None , filetypes = None , relative_path = False ) :
""" Files used by an element matching pattern, in a file in path or filelist """
2014-06-13 06:39:02 +08:00
textures = [ ]
2014-06-14 05:38:48 +08:00
if filelist is None :
filelist = rfilelist ( path , exclude_dirs ) . keys ( )
if filetypes is not None :
filelist = [ f for f in filelist if os . path . splitext ( f ) [ 1 ] in filetypes ]
2014-06-13 06:39:02 +08:00
texfind = re . compile ( pattern )
2014-06-14 05:38:48 +08:00
for file in filelist :
try :
2014-07-28 16:10:12 +08:00
f = open ( path_join ( path , file ) , ' r ' , errors = ' replace ' )
2014-06-14 05:38:48 +08:00
except FileNotFoundError :
continue
2014-06-13 06:39:02 +08:00
for line in f :
tex = texfind . search ( line )
if tex :
2014-06-14 05:38:48 +08:00
if relative_path :
2014-07-28 16:10:12 +08:00
textures . append ( os . path . normpath ( path_join ( os . path . dirname ( file ) , tex . group ( 1 ) . replace ( ' \\ ' , ' / ' ) ) ) )
2014-06-14 05:38:48 +08:00
else :
textures . append ( os . path . normpath ( tex . group ( 1 ) . replace ( ' \\ ' , ' / ' ) ) )
2014-06-13 06:39:02 +08:00
return textures
2014-07-28 16:10:12 +08:00
def find_unused_textures ( basedir , output_lists = True , grep_check = False , output_rsync_rules = False , output_comparison_strips = False , output_removal_commands = False , return_used_noregions = False ) :
2014-06-13 06:39:02 +08:00
""" Checks if any textures are unused (wasting space), and if any textures are only available as .dds (not recommended in the source repository, as it is a lossy-compressed format)
Set basedir to your fg - root , and enable the kind ( s ) of output you want :
output_lists prints lists of unused textures , and of dds - only textures
grep_check checks for possible use outside the normal directories ; requires Unix shell
output_rsync_rules prints rsync rules for excluding unused textures from the release flightgear - data . Warning : if you use this , re - run this script regularly , in case they start being used
2014-06-14 05:38:48 +08:00
output_comparison_strips creates thumbnail strips , unused_duplicate . png / unused_dds . png / high_low . png , for visually checking whether same - name textures are the same ( remove the unused one entirely ) or different ( move it to Unused ) ; requires imagemagick or graphicsmagick
2014-06-13 06:39:02 +08:00
output_removal_commands creates another script , delete_unused_textures . sh , which will remove unused textures when run in a Unix shell """
2014-06-18 04:35:49 +08:00
false_positives = set ( [ ' buildings-lightmap.png ' , ' buildings.png ' , ' Credits ' , ' Globe/00README.txt ' , ' Globe/01READMEocean_depth_1png.txt ' , ' Globe/world.topo.bathy.200407.3x4096x2048.png ' , ' Trees/convert.pl ' , ' Splash1.png ' , ' Splash2.png ' , ' Splash3.png ' , ' Splash4.png ' , ' Splash5.png ' ] ) #these either aren't textures, or are used where we don't check; 'unknown.rgb','Terrain/unknown.rgb' are also referenced, but already don't exist
2014-07-28 16:10:12 +08:00
used_textures = set ( files_used ( path = path_join ( basedir , ' Materials ' ) , pattern = r ' <(?:texture|object-mask|tree-texture).*?>( \ S+?)</(texture|object-mask|tree-texture) ' ) ) | false_positives
used_textures_noregions = set ( files_used ( path = path_join ( basedir , ' Materials ' ) , exclude_dirs = [ ' regions ' ] , pattern = r ' <(?:texture|object-mask|tree-texture).*?>( \ S+?)</(texture|object-mask|tree-texture) ' ) ) | false_positives #this pattern matches a <texture> (possibly with number), <tree-texture> or <object-mask> element
used_effectslow = set ( files_used ( path = path_join ( basedir , ' Effects ' ) , pattern = r ' image.*?>[ \\ /]?Textures[ \\ /]( \ S+?)</.*?image ' ) ) | set ( files_used ( path = path_join ( basedir , ' Materials ' ) , pattern = r ' <building-(?:texture|lightmap).*?>Textures[ \\ /]( \ S+?)</building-(?:texture|lightmap) ' ) ) #Effects (<image>), and Materials <building-texture>/<building-lightmap>, explicitly includes the Textures/ or Textures.high/
used_effectshigh = set ( files_used ( path = path_join ( basedir , ' Effects ' ) , pattern = r ' image.*?>[ \\ /]?Textures.high[ \\ /]( \ S+?)</.*?image ' ) ) | set ( files_used ( path = path_join ( basedir , ' Materials ' ) , pattern = r ' <building-(?:texture|lightmap).*?>Textures.high[ \\ /]( \ S+?)</building-(?:texture|lightmap) ' ) )
high_tsizes = rfilelist ( path_join ( basedir , ' Textures.high ' ) )
2014-06-13 06:39:02 +08:00
high_textures = set ( high_tsizes . keys ( ) )
2014-07-28 16:10:12 +08:00
low_tsizes = rfilelist ( path_join ( basedir , ' Textures ' ) , exclude_dirs = [ ' Sky ' , ' Unused ' ] ) #sky textures are used where we don't check
2014-06-13 06:39:02 +08:00
low_textures = set ( low_tsizes . keys ( ) )
only_high = high_textures - low_textures
used_noreg_onlyhigh = ( only_high & used_textures_noregions ) | used_effectshigh
used_noreg_onlyhighsize = sum ( high_tsizes [ t ] for t in used_noreg_onlyhigh )
used_noreg_low = ( low_textures & used_textures_noregions ) | used_effectslow
2014-06-18 04:35:49 +08:00
used_noregions = used_textures_noregions | used_effectshigh | used_effectslow
2014-06-13 06:39:02 +08:00
used_noreg_lowsize = sum ( low_tsizes [ t ] for t in used_noreg_low )
used_noreg_defsize = sum ( low_tsizes [ t ] for t in ( used_textures_noregions - high_textures ) | used_effectslow ) + sum ( high_tsizes [ t ] for t in used_textures_noregions | used_effectshigh )
used_defsize = sum ( low_tsizes [ t ] for t in ( used_textures - high_textures ) | used_effectslow ) + sum ( high_tsizes [ t ] for t in used_textures | used_effectshigh )
unused = ( high_textures | low_textures ) - used_textures - used_effectslow - used_effectshigh
t_size = lambda tset : sum ( high_tsizes [ t ] for t in tset ) + sum ( low_tsizes [ t ] for t in tset )
missing = ( used_textures - ( high_textures | low_textures ) ) | ( used_effectslow - low_textures ) | ( used_effectshigh - high_textures )
if missing :
raise ValueError ( " Some used textures not found: " + repr ( missing ) )
sourceless = [ f for f in ( high_textures | low_textures ) if ( f [ - 4 : ] == " .dds " and f [ : - 4 ] + " .png " not in high_textures and ( f in high_textures or f [ : - 4 ] + " .png " not in low_textures ) ) ] + [ ' Terrain/airport.dds ' ] #airport.dds isn't the same as airport.png; crop-colors.dds/cropgrass-colors.dds/rock-colors.dds/forest-colors.dds also differ but only in strip width, which doesn't matter as they are 1D color strips
sourceless_used = set ( sourceless ) - unused
needed_as_source = [ f for f in unused if ( f [ - 4 : ] != " .dds " and f [ : - 4 ] + " .png " in ( used_textures | used_effectslow | used_effectshigh ) or f [ : - 4 ] + " .dds " in ( used_textures | used_effectslow | used_effectshigh ) ) ] + [ ' Runway/designation_letters.svg ' ]
known_non_duplicates = [ ' deciduous.png ' , ' drycrop.png ' , ' irrcrop.png ' , ' marsh1.png ' , ' gravel.png ' , ' Town.png ' , ' grass.png ' , ' mixedcrop.png ' , ' resgrid.png ' ] + [ ' glacier.png ' , ' rock.png ' , ' cropgrass.png ' ] #first group real winter textures, second group unrelated textures
unused_duplicate = [ f for f in unused if ( f [ 0 : 14 ] == " Terrain.winter " and " Terrain " + f [ 14 : ] in ( high_textures | low_textures ) and f [ 15 : ] not in known_non_duplicates ) ]
unused_dds = set ( f for f in ( unused - set ( unused_duplicate ) ) if ( f [ - 4 : ] == " .dds " and f [ : - 4 ] + " .png " in ( high_textures | low_textures ) and f != ' Terrain/airport.dds ' ) ) #airport.dds isn't the same as airport.png; crop-colors.dds/cropgrass-colors.dds/rock-colors.dds/forest-colors.dds also differ but only in strip width, which doesn't matter as they are 1D color strips
unused_other = unused - set ( unused_duplicate ) - set ( unused_dds ) - set ( needed_as_source )
known_highlow_mismatch = set ( [ ' Terrain.winter/mixedcrop4.png ' , ' Terrain.winter/cropgrass3.png ' , ' Terrain.winter/drycrop4.png ' , ' Terrain.winter/irrcrop2.png ' , ' Terrain.winter/drycrop1.png ' , ' Terrain.winter/drycrop3.png ' , ' Terrain.winter/mixedcrop1.png ' , ' Terrain.winter/ mixedforest2.png ' , ' Terrain.winter/cropgrass2.png ' , ' Terrain.winter/cropgrass1.png ' , ' Terrain.winter/tundra.png ' , ' Terrain.winter/mixedforest3.png ' , ' Terrain.winter/shrub2.png ' , ' Terrain.winter/drycrop2.png ' , ' Terrain.winter/deciduous1.png ' , ' Terrain.winter/ mixedcrop3.png ' , ' Terrain.winter/naturalcrop1.png ' ] + [ ' Terrain.winter/tundra3.png ' , ' Terrain.winter/forest1c.png ' , ' Terrain.winter/herbtundra.png ' ] + [ ' Terrain/grass_rwy.dds ' , ' Terrain/cropwood.dds ' , ' Terrain/herbtundra.dds ' , ' Terrain/irrcrop.dds ' , ' Terrain/shrub.dds ' , ' Terrain.winter/mixedforest.png ' , ' Runway/pa_taxiway.png ' , ' Runway/pc_taxiway.png ' ] ) #first group are different degrees of snow cover on the same base texture, last group unrelated textures, middle group hard to tell; p{a,c}_taxiway (only low-res has side lines) are also mismatched in .dds, but as each .dds matches its size .png, only the .png needs to be kept in Unused
lowres_maybe_source = [ ' Terrain/lava1.png ' , ' Terrain/lava2.png ' , ' Terrain/lava3.png ' , ' Terrain/sand4.png ' , ' Terrain/sand5.png ' , ' Terrain/sand6.png ' ] #these are clearly related, but the high-res version has unnatural-looking high-frequency noise, suggesting that the low-res version might be the original: keep it
unused_dds_matchhigh = set ( f for f in ( unused_dds & known_highlow_mismatch ) if f [ : - 4 ] + " .png " not in low_textures )
unused_dds_matchlow = set ( f for f in ( unused_dds & known_highlow_mismatch ) if f [ : - 4 ] + " .png " not in high_textures )
low_unneeded = ( high_textures & low_textures ) - used_effectslow - unused - set ( lowres_maybe_source )
low_unneeded_duplicate = low_unneeded - set ( known_highlow_mismatch )
low_unneeded_nondup = low_unneeded & set ( known_highlow_mismatch )
def image_check_strip ( basedir , index_fname , ilist1 , ilist2 = None , size = 128 ) :
""" Generate two rows of thumbnails, for easy visual comparison (between the two lists given, or if a single list is given, between low and high resolution) """
if ilist2 is None :
2014-07-28 16:10:12 +08:00
ipairs = [ [ path_join ( basedir , ' Textures ' , f ) , path_join ( basedir , ' Textures.high ' , f ) ] for f in ilist1 ]
2014-06-13 06:39:02 +08:00
else :
ipairs = [ ]
for f1 , f2 in zip ( ilist1 , ilist2 ) :
if f1 in low_textures :
2014-07-28 16:10:12 +08:00
ipairs . append ( [ path_join ( basedir , ' Textures ' , f1 ) , path_join ( basedir , ' Textures ' , f2 ) if f2 in low_textures else path_join ( basedir , ' Textures.high ' , f2 ) ] )
2014-06-13 06:39:02 +08:00
if f1 in high_textures :
2014-07-28 16:10:12 +08:00
ipairs . append ( [ path_join ( basedir , ' Textures.high ' , f1 ) , path_join ( basedir , ' Textures.high ' , f2 ) if f2 in high_textures else path_join ( basedir , ' Textures ' , f2 ) ] )
2014-06-13 06:39:02 +08:00
ilist_f = [ f [ 0 ] for f in ipairs ] + [ f [ 1 ] for f in ipairs ]
subprocess . call ( [ ' montage ' , ' -label ' , " ' %f ' " ] + ilist_f + [ ' -tile ' , ' x2 ' , ' -geometry ' , str ( size ) + ' x ' + str ( size ) ] + [ index_fname ] )
def rsync_rules ( basedir , flist , include = False , high = None ) :
""" Output rsync rules to exclude/include the specified textures from high/low/both (high=True/False/None) resolutions """
for f in flist :
if high != True and f in low_textures :
2014-07-28 16:10:12 +08:00
print ( " + " if include else " - " , path_join ( ' /fgdata/Textures ' , f ) )
2014-06-13 06:39:02 +08:00
if high != False and f in high_textures :
2014-07-28 16:10:12 +08:00
print ( " + " if include else " - " , path_join ( ' /fgdata/Textures.high ' , f ) )
2014-06-13 06:39:02 +08:00
def removal_command ( basedir , flist , high = None ) :
""" Return command to delete the specified textures from high/low/both (high=True/False/None) resolutions """
if not flist :
return " "
a = " rm "
for f in flist :
if high != True and f in low_textures :
2014-07-28 16:10:12 +08:00
a = a + " " + path_join ( ' Textures ' , f )
2014-06-13 06:39:02 +08:00
if high != False and f in high_textures :
2014-07-28 16:10:12 +08:00
a = a + " " + path_join ( ' Textures.high ' , f )
2014-06-13 06:39:02 +08:00
a = a + " \n "
return a
def move_command ( basedir , flist , high = None , comment = False ) :
""" Return command to move the specified textures to Unused from high/low/both (high=True/False/None) resolutions """
if not flist :
return " "
dirset_low = set ( ) if high == True else set ( os . path . dirname ( f ) for f in set ( flist ) & low_textures )
dirset_high = set ( ) if high == False else set ( os . path . dirname ( f ) for f in set ( flist ) & high_textures )
a = " "
for d in dirset_low :
2014-07-28 16:10:12 +08:00
a = a + ( " # " if comment else " " ) + " mv --target-directory= " + path_join ( " Textures/Unused " , d ) + " " + ( " " . join ( path_join ( " Textures " , f ) for f in flist if ( os . path . dirname ( f ) == d and f in low_textures ) ) ) + " \n "
2014-06-13 06:39:02 +08:00
for d in dirset_high :
2014-07-28 16:10:12 +08:00
a = a + ( " # " if comment else " " ) + " mv --target-directory= " + path_join ( " Textures/Unused " , d + " .high " ) + " " + ( " " . join ( path_join ( " Textures.high " , f ) for f in flist if ( os . path . dirname ( f ) == d and f in high_textures ) ) ) + " \n "
2014-06-13 06:39:02 +08:00
return a
if output_comparison_strips :
image_check_strip ( basedir , " unused_duplicate.png " , unused_duplicate , [ " Terrain " + f [ 14 : ] for f in unused_duplicate ] )
image_check_strip ( basedir , " unused_dds.png " , unused_dds , [ f [ : - 4 ] + " .png " for f in unused_dds ] )
image_check_strip ( basedir , " high_low.png " , high_textures & low_textures )
#image_check_strip(basedir,"high_low2.png",[f for f in high_textures&low_textures if (f[0:14]=="Terrain.winter" or "_taxiway." in f or "lava" in f or "sand" in f)],size=512)#closer look at the doubtful cases
if output_lists :
print ( " \n \n unused-winter same as normal: " , sorted ( unused_duplicate ) , " \n size= " , t_size ( unused_duplicate ) , " \n \n unused-dds with matching png: " , sorted ( unused_dds ) , " \n size= " , t_size ( unused_dds ) , " \n \n unused-unique: " , sorted ( unused_other ) , " \n size= " , t_size ( unused_other ) , " \n \n not directly used but keep as source: " , sorted ( needed_as_source ) , " \n size= " , t_size ( needed_as_source ) , " \n \n unused low, matches high: " , sorted ( low_unneeded_duplicate ) , " \n size= " , sum ( low_tsizes [ f ] for f in low_unneeded_duplicate ) , " \n \n unused low, unique: " , sorted ( low_unneeded_nondup ) , " \n size= " , sum ( low_tsizes [ f ] for f in low_unneeded_nondup ) , " \n \n all non-sky textures size= " , sum ( high_tsizes . values ( ) ) + sum ( low_tsizes . values ( ) ) , " used size= " , used_defsize , " used no-regions size= " , used_noreg_defsize , " \n \n not found: " , sorted ( missing ) , " \n \n .dds only/highest-res: " , sorted ( sourceless ) , " \n \n .dds only/highest-res, used: " , sorted ( sourceless_used ) )
#not really meaningful after removing low-res duplicates: ,"\n\nused high-only, not regions:",sorted(used_noreg_onlyhigh),"\nsize=",used_noreg_onlyhighsize,"these+used low (i.e. minimal flightgear-data) size=",used_noreg_onlyhighsize+used_noreg_lowsize
if grep_check :
unused_f = [ os . path . basename ( f ) for f in unused ]
all_f = [ os . path . basename ( f ) for f in ( high_textures | low_textures ) ]
print ( " \n \n Possible use outside main search: " ) #used to set false_positives
subprocess . call ( [ " grep " , " -r " , " -E " , " --exclude-dir=Aircraft " , " --exclude-dir=.git " , " -e " , " ( " + " )|( " . join ( unused ) + " ) " , " /home/palmer/fs_dev/git/fgdata " , " /home/palmer/fs_dev/git/flightgear " , " /home/palmer/fs_dev/git/simgear " ] ) #everywhere using full names
subprocess . call ( [ " grep " , " -r " , " -E " , " --exclude-dir=Aircraft " , " --exclude-dir=Textures.high " , " --exclude-dir=Models " , " --exclude-dir=Materials " , " --exclude-dir=Effects " , " --exclude-dir=.git " , " -e " , " ( " + " )|( " . join ( all_f ) + " ) " , " /home/palmer/fs_dev/git/fgdata " , " /home/palmer/fs_dev/git/flightgear " , " /home/palmer/fs_dev/git/simgear " ] ) #restricted (to avoid false positives from Terrain.winter vs Terrain) using filenames
subprocess . call ( [ " grep " , " -r " , " -E " , " --exclude-dir=Aircraft " , " --exclude-dir=Textures.high " , " --exclude-dir=Models " , " --exclude-dir=Materials " , " --exclude-dir=Effects " , " --exclude-dir=.git " , " -e " , ' [. " \' ]dds ' , " /home/palmer/fs_dev/git/fgdata " , " /home/palmer/fs_dev/git/flightgear " , " /home/palmer/fs_dev/git/simgear " ] ) #check for programmatic .png -> .dds swap; none found
print ( " \n \n Use of sourceless textures: " )
subprocess . call ( [ " grep " , " -r " , " -E " , " --exclude-dir=Aircraft " , " --exclude-dir=.git " , " -e " , " ( " + " )|( " . join ( sourceless ) + " ) " , " /home/palmer/fs_dev/git/fgdata " , " /home/palmer/fs_dev/git/flightgear " , " /home/palmer/fs_dev/git/simgear " ] )
if output_rsync_rules :
print ( " \n \n Full flightgear-data: \n " )
rsync_rules ( basedir , unused )
rsync_rules ( basedir , low_unneeded , high = False )
print ( " \n \n Minimal flightgear-data: \n " )
rsync_rules ( basedir , low_textures - used_noreg_low , high = False )
rsync_rules ( basedir , high_textures - used_noreg_onlyhigh , high = True )
if output_removal_commands :
r_script = open ( ' delete_unused_textures.sh ' , ' w ' )
r_script . write ( " cd " + basedir + " \n " )
r_script . write ( " #Unused duplicates \n " )
r_script . write ( removal_command ( basedir , unused_duplicate ) )
r_script . write ( " #Unused .dds versions \n " )
r_script . write ( removal_command ( basedir , unused_dds - unused_dds_matchhigh , high = False ) )
r_script . write ( removal_command ( basedir , unused_dds - unused_dds_matchlow , high = True ) )
r_script . write ( " #Unused reduced-resolution versions \n " )
r_script . write ( removal_command ( basedir , low_unneeded_duplicate | ( unused_other & high_textures & low_textures ) - set ( lowres_maybe_source ) , high = False ) )
r_script . write ( " #Unused unique .png (move to Unused) \n " )
r_script . write ( " \n " . join ( [ " mkdir -p Textures/Unused/ " + d for d in [ ' Terrain ' , ' Terrain.winter ' , ' Trees ' , ' Terrain.high ' , ' Terrain.winter.high ' , ' Trees.high ' , ' Runway ' , ' Water ' ] ] ) + " \n " )
r_script . write ( move_command ( basedir , [ f for f in unused_other & high_textures if ( f [ - 4 : ] != " .dds " and f [ : 5 ] != " Signs " and f [ : 6 ] != " Runway " ) ] , high = True ) )
r_script . write ( move_command ( basedir , [ f for f in ( unused_other - high_textures ) | low_unneeded_nondup if ( f [ - 4 : ] != " .dds " and f [ : 5 ] != " Signs " and f [ : 6 ] != " Runway " ) ] , high = False ) )
r_script . write ( " #Unused unique .dds \n " )
r_script . write ( " #It is my opinion that these should go, but if you ' d prefer to move them to Unused I won ' t argue further \n " )
r_script . write ( removal_command ( basedir , [ f for f in ( unused_other & high_textures ) | unused_dds_matchlow if ( f [ - 4 : ] == " .dds " and f [ : 5 ] != " Signs " and f [ : 6 ] != " Runway " ) ] , high = True ) )
r_script . write ( removal_command ( basedir , [ f for f in ( unused_other - high_textures ) | low_unneeded_nondup | unused_dds_matchhigh if f [ - 4 : ] == " .dds " ] , high = False ) )
r_script . write ( move_command ( basedir , [ f for f in ( unused_other & high_textures ) | unused_dds_matchlow if ( f [ - 4 : ] == " .dds " and f [ : 5 ] != " Signs " and f [ : 6 ] != " Runway " ) ] , high = True , comment = True ) )
r_script . write ( move_command ( basedir , [ f for f in ( unused_other - high_textures ) | low_unneeded_nondup | unused_dds_matchhigh if ( f [ - 4 : ] == " .dds " and f [ : 5 ] != " Signs " and f [ : 6 ] != " Runway " ) ] , high = False , comment = True ) )
r_script . close ( )
2014-06-18 04:35:49 +08:00
if return_used_noregions :
2014-07-28 16:10:12 +08:00
return used_noregions | set ( [ path_join ( ' Sky ' , f ) for f in rfilelist ( path_join ( basedir , ' Textures/Sky ' ) ) ] )
2014-06-14 05:38:48 +08:00
def find_locally_unused_models ( basedir ) :
""" Find models not used in the base scenery (these do need to be in Terrasync as they may well be used in other locations, but don ' t need to be in the base flightgear-data package)
2014-06-18 04:35:49 +08:00
Known bug : doesn ' t search everywhere: check /Nasal,.eff <image>,<inherits-from>,/(AI/)Aircraft not referenced in AI scenarios, unusual tags in Aircraft/Generic/Human/Models/walker.xml,HLA/av-aircraft.xml,/Environment,MP/aircraft_types.xml,preferences.xml " " "
2014-07-28 16:10:12 +08:00
models_allfiles = { path_join ( ' Models ' , f ) : s for f , s in rfilelist ( path_join ( basedir , ' Models ' ) ) . items ( ) }
2014-06-14 05:38:48 +08:00
t_size = lambda flist : sum ( models_allfiles [ f ] for f in flist if f in models_allfiles )
2014-07-28 16:10:12 +08:00
used_models = set ( files_used ( path = path_join ( basedir , ' Scenery ' ) , filetypes = " .stg " , pattern = r ' OBJECT_SHARED ( \ S+?) ' ) ) | set ( files_used ( path = path_join ( basedir , ' AI ' ) , exclude_dirs = [ " Aircraft " , " Traffic " ] , pattern = r ' <model>[ \\ /]?( \ S+?)</model> ' ) ) | set ( f for f in files_used ( path = path_join ( basedir , ' Materials ' ) , filetypes = " .xml " , pattern = r ' <path>[ \\ /]?( \ S+?)</path> ' ) if f [ - 4 : ] == " .xml " )
2014-06-14 05:38:48 +08:00
n = 0
while n != len ( used_models ) :
n = len ( used_models )
used_models = used_models | set ( f for f in files_used ( path = basedir , filelist = used_models , filetypes = " .xml " , pattern = r ' <path>[ \\ /]?( \ S+?)</path> ' ) if f [ - 4 : ] == " .xml " )
used_textures = set ( files_used ( path = basedir , filelist = used_models , filetypes = " .ac " , pattern = r ' texture " ( \ S+?) " ' , relative_path = True ) ) | set ( files_used ( path = basedir , filelist = used_models , filetypes = " .xml " , pattern = r ' <texture>[ \\ /]?( \ S+?)</texture> ' , relative_path = True ) )
extra_used_models = set ( )
for f1 in used_models :
if f1 [ - 4 : ] != " .xml " :
continue
for f2 in files_used ( path = basedir , filelist = [ f1 ] , pattern = r ' <path>[ \\ /]?( \ S+?)</path> ' , relative_path = True ) :
if f2 [ - 3 : ] != " .ac " :
continue
extra_used_models = extra_used_models | set ( [ f2 ] )
p2 = [ f for f in files_used ( path = basedir , filelist = [ f1 ] , pattern = r ' <texture-path>[ \\ /]?( \ S+?)</texture-path> ' , relative_path = True ) ]
if len ( p2 ) == 0 :
p2 = [ os . path . dirname ( f1 ) ]
if len ( p2 ) != 1 :
print ( " non-unique/not found: " , f1 , f2 , p2 )
continue
try :
2014-07-28 16:10:12 +08:00
used_textures = used_textures | set ( os . path . normpath ( path_join ( p2 [ 0 ] , f ) ) for f in files_used ( path = basedir , filelist = [ f2 ] , filetypes = " .ac " , pattern = r ' texture " ( \ S+?) " ' ) )
2014-06-14 05:38:48 +08:00
except ( IOError , OSError ) :
print ( " not found " , f1 , f2 , p2 )
used_models = used_models | extra_used_models
unused = set ( models_allfiles . keys ( ) ) - ( used_models | used_textures )
2014-07-28 16:10:12 +08:00
missing = set ( f for f in ( used_models | used_textures ) if ( ( f . startswith ( ' Models ' ) and f not in models_allfiles . keys ( ) ) or not os . path . isfile ( path_join ( basedir , f ) ) ) )
2014-06-14 05:38:48 +08:00
print ( " used \n " , sorted ( used_models ) , " \n size= " , t_size ( used_models ) , " \n \n " , sorted ( used_textures ) , " \n size= " , t_size ( used_textures ) , " \n \n unused \n " , sorted ( unused ) , " \n size= " , t_size ( unused ) , " \n \n missing \n " , sorted ( missing ) , " \n size= " , t_size ( missing ) )
2014-06-13 06:39:02 +08:00
def size_by_type ( path , exclude_dirs = [ ] ) :
""" Dict of total file size by file extension """
files = rfilelist ( path , exclude_dirs )
size_totals = defaultdict ( int )
for filename , size in files . items ( ) :
file_ext = os . path . splitext ( filename ) [ 1 ]
if file_ext == " .gz " :
file_ext = os . path . splitext ( os . path . splitext ( filename ) [ 0 ] ) [ 1 ] + file_ext
size_totals [ file_ext ] = size_totals [ file_ext ] + size
return size_totals
def size_by_size ( path , exclude_dirs = [ ] , exts = [ " .png " , " .dds " , " .rgb " ] ) :
""" Dict of total file size by individual file size range, of given extensions (empty list for all files) """
files = rfilelist ( path , exclude_dirs )
size_totals = defaultdict ( int )
for filename , size in files . items ( ) :
file_ext = os . path . splitext ( filename ) [ 1 ]
if ( not exts ) or ( file_ext in exts ) :
size_totals [ 2 * * math . frexp ( size ) [ 1 ] ] = size_totals [ 2 * * math . frexp ( size ) [ 1 ] ] + size
return size_totals
2014-06-14 05:38:48 +08:00
def fgdata_size ( path , dirs_to_list = [ " AI/Aircraft " , " AI/Traffic " , " Aircraft " , " Models " , " Scenery " , " Textures " , " Textures.high " ] , exclude_dirs = None , compressed_size = False , num_types = 3 ) :
if dirs_to_list is None :
2014-07-28 16:10:12 +08:00
dirs_to_list = [ d for d in os . listdir ( path ) if os . path . isdir ( path_join ( path , d ) ) ]
2014-06-14 05:38:48 +08:00
if exclude_dirs is None :
2014-07-28 16:10:12 +08:00
if os . path . exists ( path_join ( path , " .git " ) ) :
2014-06-14 05:38:48 +08:00
exclude_dirs = [ " .git " , " Aircraft " ]
else :
exclude_dirs = [ ]
total_compressed_size = 0
2014-06-13 06:39:02 +08:00
exclude_list = [ [ ] ] * len ( dirs_to_list ) + [ dirs_to_list + exclude_dirs ] + [ exclude_dirs ]
names_list = dirs_to_list + [ " other " , " all " ]
for n , dir1 in enumerate ( dirs_to_list + [ " " , " " ] ) :
2014-07-28 16:10:12 +08:00
size_totals = size_by_type ( path_join ( path , dir1 ) , exclude_list [ n ] )
2014-06-14 05:38:48 +08:00
print ( names_list [ n ] , sorted ( size_totals . items ( ) , key = lambda x : - x [ 1 ] ) [ : num_types ] , " total " , sum ( size_totals . values ( ) ) )
if compressed_size :
if names_list [ n ] == " all " :
print ( " compressed size " , total_compressed_size )
continue
targz = tarfile . open ( " fgdata_sizetest_temp.tar.gz " , mode = " w:gz " )
2014-07-28 16:10:12 +08:00
for file in rfilelist ( path_join ( path , dir1 ) , exclude_list [ n ] ) :
targz . add ( path_join ( path , dir1 , file ) )
2014-06-14 05:38:48 +08:00
targz . close ( )
print ( " compressed size " , os . path . getsize ( " fgdata_sizetest_temp.tar.gz " ) )
total_compressed_size = total_compressed_size + os . path . getsize ( " fgdata_sizetest_temp.tar.gz " )
2014-06-13 06:39:02 +08:00
2014-07-28 16:10:12 +08:00
def create_reduced_fgdata ( input_path , output_path , split_textures = True , exclude_parts = [ ] , include_aircraft = [ ' UIUC ' , ' 777 ' , ' 777-200 ' , ' b1900d ' , ' CitationX ' , ' ZLT-NT ' , ' dhc2 ' , ' Cub ' , ' sopwithCamel ' , ' f-14b ' , ' ASK13 ' , ' bo105 ' , ' Dragonfly ' , ' SenecaII ' , ' A6M2 ' ] , dirs_to_downsample = ( ) , downsample_min_filesize = 30000 ) :
2014-06-13 06:39:02 +08:00
""" Create a smaller, reduced-quality flightgear-data package
2014-07-28 16:10:12 +08:00
Can downsample textures 50 % and / or omit sections
Requires Unix shell ; downsampling requires imagemagick or graphicsmagick ( for convert ) and libnvtt - bin ( for nvcompress )
Optional parts , use exclude_parts to omit :
ai : no background traffic , but tankers etc do still work
extra - textures ( requires split_textures = True ) : no region - specific textures
The c172p and ufo are always included ; other aircraft are added by include_aircraft
Texture downsampling : textures in dirs_to_downsample and larger than downsample_min_filesize downsampled 50 %
Example : dirs_to_downsample = ( " Textures.high/Terrain " , " Textures.high/Trees " , " Textures.high/Terrain.winter " , " AI/Aircraft " , " Models " ) , downsample_min_filesize = 30000
To put each section in its own directory use { 0 } in output_path , e . g .
2014-07-29 03:47:43 +08:00
python3 - c " import fgdata_checkers; fgdata_checkers.create_reduced_fgdata(input_path= ' /home/palmer/fs_dev/git/fgdata ' ,output_path= ' /home/palmer/fs_dev/flightgear/data_split/debian/flightgear-data- {0} /usr/share/games/flightgear ' ,include_aircraft=[ ' UIUC ' , ' b1900d ' , ' CitationX ' , ' ZLT-NT ' , ' dhc2 ' , ' Cub ' , ' sopwithCamel ' , ' f-14b ' , ' ASK13 ' , ' bo105 ' , ' Dragonfly ' , ' SenecaII ' , ' A6M2 ' ]) "
2014-07-28 16:10:12 +08:00
This creates separate preferences - regions . xml and preferences - noregions . xml files for with and without regional textures ; you need to handle symlinking preferences . xml to the correct one
"""
2014-06-14 05:38:48 +08:00
texture_filetypes = { " .png " : " PNG " , " .dds " : " DDS " } #,".rgb":"SGI" loses cloud transparency
2014-06-18 04:35:49 +08:00
exclude_dirs = [ " .git " , " Textures/Unused " ]
2014-07-28 16:10:12 +08:00
exclude_unnamed_subdirs = [ " Aircraft " ] #these are a separate mechanism from subtree_class/exclude_parts mostly to save time (subtree_class still fully scans excluded directories because the class may change again further down the tree, e.g. AI/Aircraft ai -> performancedb.xml base; these don't)
subtree_class = { " Aircraft/c172p " : " base " , " Aircraft/Generic " : " base " , " Aircraft/Instruments " : " base " , " Aircraft/Instruments-3d " : " base " , " Aircraft/ufo " : " base " , " Textures " : " textures " , " Textures.high " : " textures " , " AI/Aircraft " : " ai " , " AI/Traffic " : " ai " , " AI/Aircraft/performancedb.xml " : " base " , " Scenery " : " scenery " , " Models " : " models " }
for aircraft in include_aircraft :
if " Aircraft/ " + aircraft not in subtree_class :
subtree_class [ " Aircraft/ " + aircraft ] = " aircraft "
2014-06-18 04:35:49 +08:00
include_files = [ ]
2014-07-28 16:10:12 +08:00
if split_textures :
base_texture_files = [ ]
for t in find_unused_textures ( input_path , return_used_noregions = True ) :
base_texture_files . extend ( [ path_join ( " Textures " , t ) , path_join ( " Textures.high " , t ) ] )
#if os.path.exists(path_join(input_path,".git")):
#print(input_path,"appears to be a git clone; this will work, but the result will be larger than starting from a standard flightgear-data package.\nTo create this use (adjusting paths as necessary) rsync -av --filter=\"merge /home/palmer/fs_dev/git/fgmeta/base-package.rules\" ~/fs_dev/git/fgdata ~/fs_dev/flightgear/data_full")
if os . path . exists ( output_path . format ( " base " ) ) and os . listdir ( output_path . format ( " base " ) ) :
2014-06-13 06:39:02 +08:00
print ( " output path " , output_path , " non-empty, aborting to avoid data loss \n If you did want to lose its previous contents, run: \n rm -r " , output_path , " \n then re-run this script " )
return
2014-07-28 16:10:12 +08:00
dirs = { " " : " base " }
2014-06-13 06:39:02 +08:00
while dirs :
2014-07-28 16:10:12 +08:00
cdir , cclass = dirs . popitem ( )
cdirfiles = os . listdir ( path_join ( input_path , cdir ) )
2014-06-13 06:39:02 +08:00
for file in cdirfiles :
2014-07-28 16:10:12 +08:00
fclass = subtree_class . get ( path_join ( cdir , file ) , cclass )
if os . path . isdir ( path_join ( input_path , cdir , file ) ) :
if ( path_join ( cdir , file ) not in exclude_dirs ) and ( cdir not in exclude_unnamed_subdirs or path_join ( cdir , file ) in subtree_class ) :
dirs [ path_join ( cdir , file ) ] = fclass
else : #file
if split_textures and fclass == " textures " :
if path_join ( cdir , file ) in base_texture_files :
fclass = " base-textures "
else :
fclass = " extra-textures "
if fclass in exclude_parts :
2014-06-18 04:35:49 +08:00
continue
2014-07-28 16:10:12 +08:00
if not os . path . exists ( path_join ( output_path . format ( fclass ) , cdir ) ) :
subprocess . call ( [ " mkdir " , " -p " , path_join ( output_path . format ( fclass ) , cdir ) ] )
if ( cdir . startswith ( dirs_to_downsample ) ) and ( os . path . splitext ( file ) [ 1 ] in texture_filetypes ) and ( os . path . getsize ( path_join ( input_path , cdir , file ) ) > downsample_min_filesize ) :
2014-06-14 05:38:48 +08:00
image_type = texture_filetypes [ os . path . splitext ( file ) [ 1 ] ]
2014-07-29 03:47:43 +08:00
if " {0} " in output_path and fclass == " base-textures " : #downsampled in base-textures, full resolution in extra-textures
if not os . path . exists ( path_join ( output_path . format ( " extra-textures " ) , cdir ) ) :
subprocess . call ( [ " mkdir " , " -p " , path_join ( output_path . format ( " extra-textures " ) , cdir ) ] )
subprocess . call ( [ " cp " , path_join ( input_path , cdir , file ) , path_join ( output_path . format ( " extra-textures " ) , cdir , file ) ] )
2014-06-14 05:38:48 +08:00
if image_type == " DDS " : # in Ubuntu, neither imagemagick nor graphicsmagick can write .dds
2014-07-28 16:10:12 +08:00
#doesn't work subprocess.call(["nvzoom","-s","0.5","-f","box",path_join(input_path,cdir,file),path_join(output_path.format(fclass),cdir,file)])
if subprocess . call ( [ " convert " , image_type + " : " + path_join ( input_path , cdir , file ) , " -sample " , " 50 % " , " temp_reduced_size.png " ] ) : #fails on normal maps, so just copy them
subprocess . call ( [ " cp " , path_join ( input_path , cdir , file ) , path_join ( output_path . format ( fclass ) , cdir , file ) ] )
2014-06-14 05:38:48 +08:00
else :
2014-07-28 16:10:12 +08:00
subprocess . call ( [ " nvcompress " , " -bc3 " , " temp_reduced_size.png " , path_join ( output_path . format ( fclass ) , cdir , file ) ] )
2014-06-14 05:38:48 +08:00
else :
2014-07-28 16:10:12 +08:00
subprocess . call ( [ " convert " , image_type + " : " + path_join ( input_path , cdir , file ) , " -sample " , " 50 % " , image_type + " : " + path_join ( output_path . format ( fclass ) , cdir , file ) ] ) #we use sample rather than an averaging filter to not break mask/rotation/... maps
else : #not to be downsampled
subprocess . call ( [ " cp " , path_join ( input_path , cdir , file ) , path_join ( output_path . format ( fclass ) , cdir , file ) ] )
if " {0} " in output_path :
2014-07-29 03:47:43 +08:00
subprocess . call ( [ " mv " , path_join ( output_path . format ( " base " ) , " preferences.xml " ) , path_join ( output_path . format ( " base " ) , " preferences-regions.xml " ) ] )
2014-07-28 16:10:12 +08:00
if " extra-textures " in exclude_parts or " {0} " in output_path :
prefs_in = open ( path_join ( input_path , " preferences.xml " ) , ' r ' )
prefs_out = open ( path_join ( output_path . format ( " base " ) , " preferences-noregions.xml " if " {0} " in output_path else " preferences.xml " ) , ' w ' )
prefs_str = prefs_in . read ( None )
prefs_in . close ( )
2014-06-18 04:35:49 +08:00
prefs_str = prefs_str . replace ( " Materials/regions/materials.xml " , " Materials/default/materials.xml " ) #turn off regional textures
2014-07-28 16:10:12 +08:00
prefs_out . write ( prefs_str )
prefs_out . close ( )
def check_text_encoding ( path , filelist = None , binary_types = ( " .png " , " .dds " , " .rgb " , " .RGB " , " .jpg " , " .wav " , " .WAV " , " .btg.gz " , " .xcf.gz " , " .xcf " , " .XCF " , " Thumbs.db " , " .blend " , " .bmp " , " .gif " , " .3ds " , " .3DS " , " .pdf " , " .ttf " , " .txf " , " .htsvoice " , " .odt " , " .ods " , " .xls " , " .mp3 " , " .zip " , " .tar.gz " ) , exclude_dirs = [ " .git " , " Timezone " ] ) :
""" filelist is intended for quick testing: see fgdata_nonascii_filelist.py """
def err_context ( err ) :
start = max ( err . object . rfind ( b ' \n ' , 0 , err . start ) + 1 , err . start - 30 , 0 )
end = min ( err . object . find ( b ' \n ' , err . start ) , err . start + 30 , len ( err . object ) )
if end < 0 : #not found
end = err . start + 30
return err . object [ start : end ]
def dict_print ( d ) :
return " " . join ( i [ 0 ] + " \n \t " + str ( i [ 1 ] ) + " \n \t " + ( str ( i [ 1 ] , encoding = " utf-8 " , errors = " replace " ) + " \n \t " + str ( i [ 1 ] , encoding = " latin-1 " ) if type ( i [ 1 ] ) == bytes else " " ) + " \n " for i in sorted ( d . items ( ) ) )
if filelist is None :
filelist = [ f for f in rfilelist ( path , exclude_dirs ) if not f . endswith ( tuple ( binary_types ) ) ]
utf8_files = { }
withnulls_files = [ ]
othertext_files = { }
mislabeled_xml = { }
mislabeled_xml_nocomments = { }
xml_encoding_pattern = re . compile ( r ' < \ ?xml.*?encoding= " ( \ S+?) " .*? \ ?> ' )
xml_noencoding_pattern = re . compile ( r ' < \ ?xml.*? \ ?> ' )
utf8_files_nocomments = { }
othertext_files_nocomments = { }
for fname in filelist :
if os . path . splitext ( fname ) [ 1 ] == " .gz " :
fobj = gzip . open ( path_join ( path , fname ) , mode = ' rb ' )
else :
fobj = open ( path_join ( path , fname ) , mode = ' rb ' )
fdata = fobj . read ( )
if b " \0 " in fdata :
withnulls_files . append ( fname ) #two look like corrupted files: Aircraft/p51d/Resources/WIP/P-51D-25NA.ac (hangs gedit,large block of nulls in middle) Docs/Serial/nmeafaq.txt (block of nulls at end), rest are probably-binary types
continue
if os . path . splitext ( fname ) [ 1 ] in ( " .xml " , " .svg " , " .xhtml " ) :
encoding_mark = xml_encoding_pattern . search ( str ( fdata . split ( b ' \n ' , maxsplit = 1 ) [ 0 ] , encoding = " utf-8 " ) )
if encoding_mark :
encoding_mark = encoding_mark . group ( 1 )
if encoding_mark not in ( " utf-8 " , " UTF-8 " , " ISO-8859-1 " ) :
mislabeled_xml_nocomments [ fname ] = " unrecognised encoding " + encoding_mark
encoding_mark = None
else :
if xml_noencoding_pattern . search ( str ( fdata . split ( b ' \n ' , maxsplit = 1 ) [ 0 ] , encoding = " utf-8 " ) ) :
encoding_mark = " utf-8 " #XML standard allows either UTF-8 or UTF-16 (with BOM) in unlabeled files, but we only use -8
else :
encoding_mark = None
#mislabeled_xml_nocomments[fname]="no xml header"
else :
encoding_mark = None
try :
fdata . decode ( encoding = " ascii " )
continue
except UnicodeError as err :
errline = err_context ( err )
try :
fdata . decode ( encoding = " utf-8 " )
utf8_files [ fname ] = errline
if encoding_mark not in ( " utf-8 " , " UTF-8 " , None ) :
mislabeled_xml [ fname ] = bytes ( encoding_mark , encoding = " ascii " ) + errline
except UnicodeError as err :
errline = err_context ( err )
othertext_files [ fname ] = errline
if encoding_mark not in ( " ISO-8859-1 " , None ) :
mislabeled_xml [ fname ] = bytes ( encoding_mark , encoding = " ascii " ) + errline
if os . path . basename ( fname ) in ( " Read-Me.txt " , " README.txt " , " Readme.txt " , " readme.txt " , " LIS-MOI_GNU-GPL " ) :
continue
fdata_nocomments = strip_comments ( fdata , filename = fname )
if fdata_nocomments . startswith ( bytes ( [ 0xef , 0xbb , 0xbf ] ) ) and fname not in mislabeled_xml : #UTF-8 BOM
fdata_nocomments = fdata_nocomments [ 3 : ]
try :
fdata_nocomments . decode ( encoding = " ascii " )
continue
except UnicodeError as err :
errline = err_context ( err )
try :
fdata_nocomments . decode ( encoding = " utf-8 " )
if encoding_mark is None :
utf8_files_nocomments [ fname ] = errline
if encoding_mark not in ( " utf-8 " , " UTF-8 " , None ) :
mislabeled_xml_nocomments [ fname ] = bytes ( encoding_mark , encoding = " ascii " ) + errline
except UnicodeError as err :
errline = err_context ( err )
if encoding_mark is None :
othertext_files_nocomments [ fname ] = errline
if encoding_mark not in ( " ISO-8859-1 " , None ) :
mislabeled_xml_nocomments [ fname ] = bytes ( encoding_mark , encoding = " ascii " ) + errline
print ( " non-ASCII valid UTF-8: " , dict_print ( utf8_files ) , " \n \n other: " , dict_print ( othertext_files ) , " \n \n mislabeled/unrecognised " , dict_print ( mislabeled_xml ) , " \n \n with nulls (binary or UTF-16/32): " , sorted ( withnulls_files ) , " \n \n non-ASCII valid UTF-8 (outside BOM/comments): " , dict_print ( utf8_files_nocomments ) , " \n \n other (outside comments): " , dict_print ( othertext_files_nocomments ) , " \n \n mislabeled/unrecognised (outside comments) " , dict_print ( mislabeled_xml_nocomments ) )