python文件排序的方法总结

avatar 2020年11月27日10:14:43 评论 160

在python环境中提供两种排序方案:用库函数sorted()对字符串排序,它的对象是字符;用函数sort()对数字排序,它的对象是数字,如果读取文件的话,需要进行处理(把文件后缀名‘屏蔽')。

(1)首先:我测试的文件夹是/img/,里面的文件都是图片,如下图所示:

5e456c82768e849deb5d28f4bf56f0f.png

(2)测试库函数sorted(),直接贴出代码:

import numpy as np
import os
 
img_path='./img/'
 
img_list=sorted(os.listdir(img_path))#文件名按字母排序
img_nums=len(img_list)
for i in range(img_nums):
    img_name=img_path+img_list[i]
    print(img_name)

运行效果如下:

d1bae9b804c553b4d24a6e0381710b4.png

从图片可以清晰的看出,文件名是按字符排序的。

(3)测试函数sort(),代码:

import numpy as np
import os
img_path='./img/'
 
img_list=os.listdir(img_path)
img_list.sort()
img_list.sort(key = lambda x: int(x[:-4])) ##文件名按数字排序
img_nums=len(img_list)
for i in range(img_nums):
    img_name=img_path+img_list[i]
    print(img_name)

运行效果如下:

f2e1a74694600bcc7591d32a02c30ba.png

可以看出,文件名是按数字排序的;顺便提下,sort函数中用到了匿名函数(key = lambda x:int(x[:-4])),其作用是将后缀名'.jpg'“屏蔽”(因为‘.jpg'是4个字符,所以[:-4]的含义是从文件名开始到倒数第四个字符为止),具体看python的匿名函数和数组取值方式。

实例扩展:

import gzip
import os
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
from datetime import datetime
def sort_worker(input,output):
 while True:
 lines = input.get().splitlines()
 element_set = {}
 for line in lines:
  if line.strip() == 'STOP':
   return
  try:
   element = line.split(' ')[0]
   if not element_set.get(element): element_set[element] = ''
  except:
   pass
 sorted_element = sorted(element_set)
 #print sorted_element
 output.put('n'.join(sorted_element))
def write_worker(input, pre):
 os.system('mkdir %s'%pre)
 i = 0
 while True:
  content = input.get()
  if content.strip() == 'STOP':
   return
  write_sorted_bulk(content, '%s/%s'%(pre, i))
  i += 1
def write_sorted_bulk(content, filename):
 f = file(filename, 'w')
 f.write(content)
 f.close()
def split_sort_file(filename, num_sort = 3, buf_size = 65536*64*4):
 t = datetime.now()
 pre, ext = os.path.splitext(filename)
 if ext == '.gz':
  file_file = gzip.open(filename, 'rb')
 else:
  file_file = open(filename)
 bulk_queue = Queue(10)
 sorted_queue = Queue(10)
 NUM_SORT = num_sort
 sort_worker_pool = []
 for i in range(NUM_SORT):
  sort_worker_pool.append( Process(target=sort_worker, args=(bulk_queue, sorted_queue)) )
  sort_worker_pool[i].start()
 NUM_WRITE = 1
 write_worker_pool = []
 for i in range(NUM_WRITE):
  write_worker_pool.append( Process(target=write_worker, args=(sorted_queue, pre)) )
  write_worker_pool[i].start()
 buf = file_file.read(buf_size)
 sorted_count = 0
 while len(buf):
  end_line = buf.rfind('n')
  #print buf[:end_line+1]
  bulk_queue.put(buf[:end_line+1])
  sorted_count += 1
  if end_line != -1:
   buf = buf[end_line+1:] + file_file.read(buf_size)
  else:
   buf = file_file.read(buf_size)
 for i in range(NUM_SORT):
  bulk_queue.put('STOP')
 for i in range(NUM_SORT):
  sort_worker_pool[i].join()
  
 for i in range(NUM_WRITE):
  sorted_queue.put('STOP')
 for i in range(NUM_WRITE):
  write_worker_pool[i].join()
 print 'elasped ', datetime.now() - t
 return sorted_count
from heapq import heappush, heappop
from datetime import datetime
from multiprocessing import Process, Queue, Pipe, current_process, freeze_support
import os
class file_heap:
 def __init__(self, dir, idx = 0, count = 1):
  files = os.listdir(dir)
  self.heap = []
  self.files = {}
  self.bulks = {}
  self.pre_element = None
  for i in range(len(files)):
   file = files[i]
   if hash(file) % count != idx: continue
   input = open(os.path.join(dir, file))
   self.files[i] = input
   self.bulks[i] = ''
   heappush(self.heap, (self.get_next_element_buffered(i), i))
 def get_next_element_buffered(self, i):
  if len(self.bulks[i])  1024:
   self.q.put(self.wbuf)
   self.wbuf = []
def diff_file(file_old, file_new, file_diff, buf = 268435456):
 print 'buffer size', buf
 from file_split import split_sort_file
 os.system('rm -rf '+ os.path.splitext(file_old)[0] )
 os.system('rm -rf '+ os.path.splitext(file_new)[0] )
 t = datetime.now()
 split_sort_file(file_old,5,buf)
 split_sort_file(file_new,5,buf)
 print 'split elasped ', datetime.now() - t
 os.system('cat %s/* | wc -l'%os.path.splitext(file_old)[0])
 os.system('cat %s/* | wc -l'%os.path.splitext(file_new)[0])
 os.system('rm -f '+file_diff)
 t = datetime.now()
 zdiff = open(file_diff, 'a')
 old_q = Queue(1024)
 new_q = Queue(1024)
 old_queue = queue_buffer(old_q)
 new_queue = queue_buffer(new_q)
 h1 = Process(target=heappoppush2, args=(os.path.splitext(file_old)[0], old_queue, 3))
 h2 = Process(target=heappoppush2, args=(os.path.splitext(file_new)[0], new_queue, 3))
 h1.start(), h2.start()
 old = old_queue.get()
 new = new_queue.get()
 old_count, new_count = 0, 0
 while old is not None or new is not None:
  if old > new or old is None:
   zdiff.write(' '+old+'n')
   old = old_queue.get()
   old_count +=1
  else:
   old = old_queue.get()
   new = new_queue.get()
 print 'new_count:', new_count
 print 'old_count:', old_count
 print 'diff elasped ', datetime.now() - t
 h1.join(), h2.join()

文章来源于互联网:python文件排序的方法总结

avatar

发表评论

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen: