Python 数据处理，切片，替换，去重，排序-阿里云开发者社区

Python 数据处理，切片，替换，去重，排序

2017-11-13 1405

版权

本文内容由阿里云实名注册用户自发贡献，版权归原作者所有，阿里云开发者社区不拥有其著作权，亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容，填写侵权投诉表单进行举报，一经查实，本社区将立刻删除涉嫌侵权内容。

简介：

一、对下面这组数据进行处理：先统一分隔符，再按统一的格式排序。

第一版代码：

 
          #!/usr/local/python3/bin/python3
         
          def sanitize(time_string):
              """Normalise a time string to the ``mins.secs`` form.

              The input data mixes three separators ('-', ':' and '.'); giving
              every value the same separator keeps the data uniform so that it
              can be sorted meaningfully.

              Args:
                  time_string: One raw time value, e.g. ``'2-34'`` or ``'3:21'``.

              Returns:
                  The value with its '-' or ':' separator replaced by '.'.
                  A string containing neither separator is returned unchanged.

              Raises:
                  ValueError: If splitting on the detected separator does not
                      yield exactly two parts (e.g. ``'1-2-3'``).
              """
              if '-' in time_string:
                  splitter = '-'
              elif ':' in time_string:
                  splitter = ':'
              else:
                  # No known separator: nothing to convert.
                  return time_string
              # Cut the value into its two halves at the separator found above.
              mins, secs = time_string.split(splitter)
              # Rejoin the halves with the canonical '.' separator.
              return mins + '.' + secs
         
          # `with open(...)` closes the file automatically when the block ends,
          # so no explicit close() call is needed (unlike a bare open()).
          with open('james') as jam:
              data = jam.readline()
          # Trim surrounding whitespace from the line, then split it on ','.
          james1 = data.strip().split(',')
          # NOTE: sorting the raw values at this point (sorted(james1)) is
          # possible, but it would order them before the separators have been
          # unified by sanitize(), so the sort is done on the cleaned lists below.

          with open('julie') as jul:
              data = jul.readline()
          julie1 = data.strip().split(',')

          with open('mikey') as mik:
              data = mik.readline()
          mikey1 = data.strip().split(',')

          with open('sarah') as sar:
              data = sar.readline()
          sarah1 = data.strip().split(',')

          clean_james = []
          clean_julie = []
          clean_mikey = []
          clean_sarah = []

          # Run every value through sanitize() and collect the results in a new
          # list, so that all entries of a list share one separator style.
          for each_t in james1:
              clean_james.append(sanitize(each_t))

          for each_t in julie1:
              clean_julie.append(sanitize(each_t))

          for each_t in mikey1:
              clean_mikey.append(sanitize(each_t))

          for each_t in sarah1:
              clean_sarah.append(sanitize(each_t))

          # Print each cleaned list in sorted (ascending string) order.
          print(sorted(clean_james))
          print(sorted(clean_julie))
          print(sorted(clean_mikey))
          print(sorted(clean_sarah))

输出结果：

这就完成了规则排序。

二、需要对数据去重、排序，并且只输出前三项数据。

第二版代码：

 
          #!/usr/local/python3/bin/python3
         
          def sanitize(time_string):
              """Normalise a time string to the ``mins.secs`` form.

              Args:
                  time_string: One raw time value, e.g. ``'2-34'`` or ``'3:21'``.

              Returns:
                  The value with its '-' or ':' separator replaced by '.'.
                  A string containing neither separator is returned unchanged.

              Raises:
                  ValueError: If splitting on the detected separator does not
                      yield exactly two parts (e.g. ``'1-2-3'``).
              """
              if '-' in time_string:
                  splitter = '-'
              elif ':' in time_string:
                  splitter = ':'
              else:
                  # No known separator: nothing to convert.
                  return time_string
              # Cut the value into its two halves at the separator found above.
              mins, secs = time_string.split(splitter)
              # Rejoin the halves with the canonical '.' separator.
              return mins + '.' + secs
         
          def pomoto(old_list, new_list):
              """Append to ``new_list`` every item of ``old_list`` it lacks.

              Used to drop duplicate entries: one call replaces a hand-written
              ``for``/``if not in``/``append`` loop per list.

              Args:
                  old_list: Iterable of items to copy from, in order.
                  new_list: List that receives the items; it is modified in
                      place. Items it already contains are not added again.

              Returns:
                  None. The result is delivered through ``new_list``.
              """
              for i in old_list:
                  # Only the first occurrence of each value is kept.
                  if i not in new_list:
                      new_list.append(i)
         
          with open('james') as jam:
              data = jam.readline()
          james1 = data.strip().split(',')

          with open('julie') as jul:
              data = jul.readline()
          julie1 = data.strip().split(',')

          with open('mikey') as mik:
              data = mik.readline()
          mikey1 = data.strip().split(',')

          with open('sarah') as sar:
              data = sar.readline()
          sarah1 = data.strip().split(',')

          unique_james = []
          unique_julie = []
          unique_sarah = []
          unique_mikey = []

          # List comprehensions replace the explicit for/append loops: each
          # value is sanitized and the resulting list is sorted in one step.
          clean_james = sorted([sanitize(each_t) for each_t in james1])
          clean_julie = sorted([sanitize(each_t) for each_t in julie1])
          clean_mikey = sorted([sanitize(each_t) for each_t in mikey1])
          clean_sarah = sorted([sanitize(each_t) for each_t in sarah1])

          # Remove duplicates. Each pomoto() call stands in for a per-list loop
          # of the form "for i in clean_x: if i not in unique_x: append(i)".
          # The inputs are already sorted, so the unique lists stay sorted too.
          pomoto(clean_james, unique_james)
          pomoto(clean_julie, unique_julie)
          pomoto(clean_mikey, unique_mikey)
          pomoto(clean_sarah, unique_sarah)

          # Print only the first three entries of each de-duplicated list.
          print(unique_james[0:3])
          print(unique_julie[0:3])
          print(unique_mikey[0:3])
          print(unique_sarah[0:3])

输出结果：

三、使用集合删除重复项，将重复的with open定义为函数，简洁代码，进行逆序排序，输出前三项。

第三版代码：

 
    
      
        
       
          #!/usr/local/python3/bin/python3
         
 
          def sanitize(time_string):
              """Normalise a time string to the ``mins.secs`` form.

              Args:
                  time_string: One raw time value, e.g. ``'2-34'`` or ``'3:21'``.

              Returns:
                  The value with its '-' or ':' separator replaced by '.'.
                  A string containing neither separator is returned unchanged.

              Raises:
                  ValueError: If splitting on the detected separator does not
                      yield exactly two parts (e.g. ``'1-2-3'``).
              """
              if '-' in time_string:
                  splitter = '-'
              elif ':' in time_string:
                  splitter = ':'
              else:
                  # No known separator: nothing to convert.
                  return time_string
              # Cut the value into its two halves at the separator found above.
              mins, secs = time_string.split(splitter)
              # Rejoin the halves with the canonical '.' separator.
              return mins + '.' + secs
         
 
               
         

          def get_file_data(filename):
              """Read the first line of a file and split it on commas.

              Args:
                  filename: Path of the file to read.

              Returns:
                  A list with the comma-separated fields of the first line,
                  after the line's leading/trailing whitespace is removed. An
                  empty file yields ``['']``. Returns ``None`` when the file
                  cannot be opened or read; the error is printed first.
              """
              try:
                  # Keep only the I/O inside the try so that nothing else is
                  # mistaken for a file error.
                  with open(filename) as f:
                      data = f.readline()
              except OSError as ioerr:
                  # IOError is an alias of OSError in Python 3, so this catches
                  # the same errors. A separator is added so the message does
                  # not run straight into the exception text.
                  print('File error: ' + str(ioerr))
                  return None
              return data.strip().split(',')
         

           
         

          # Each call returns the already-split values of one file, or None if
          # that file could not be read.
          james1 = get_file_data('james')
          julie1 = get_file_data('julie')
          mikey1 = get_file_data('mikey')
          sarah1 = get_file_data('sarah')

          for times in (james1, julie1, mikey1, sarah1):
              # get_file_data() has already reported the problem; skip the
              # file instead of failing with TypeError when iterating None.
              if times is None:
                  continue
              # set() drops duplicates, sorted(..., reverse=True) orders the
              # values in descending order, and [0:3] keeps the first three.
              print(sorted(set([sanitize(i) for i in times]), reverse=True)[0:3])