这学期参加了服务外包大赛,具体要实现对非结构化数据的分析处理,所以在这里把这个过程一点点记录下来。

首先根据python的爬虫框架,从网页上获取了中文文本

但是由于我不太会处理中文数据,摸索了很久,最后只是简单地通过java的substring把数据分开

package se;
import java.io.File; 
import java.io.InputStreamReader; 
import java.io.BufferedReader; 
import java.io.BufferedWriter; 
import java.io.FileInputStream; 
import java.io.FileWriter; 

/**
 * Reformats "key:value" lines from {@code info.txt} into space-aligned
 * columns written to {@code output1.txt}.
 *
 * <p>The first input line is copied verbatim as a header, followed by a line
 * holding a single space. Every later line is split at its LAST ASCII colon;
 * the text before the colon is padded with spaces so the value starts at
 * column {@link #VALUE_COLUMN}, and the row is written with a CRLF line ending
 * and echoed to standard output. Lines without a colon, or whose last colon is
 * the first or the last character, are skipped.
 *
 * <p>NOTE(review): both files are read/written with the platform default
 * charset, exactly as before. If the crawled text is UTF-8 and the platform
 * default is not, pass an explicit charset here — confirm the file encoding.
 */
public class sdf {
    /** Input file, resolved relative to the working directory. */
    private static final String INPUT_PATH = "info.txt";

    /** Output file, resolved relative to the working directory; created or truncated. */
    private static final String OUTPUT_PATH = "output1.txt";

    /** Zero-based column at which the value starts when the key is short enough. */
    private static final int VALUE_COLUMN = 20;

    /**
     * Program entry point. Takes no arguments; any failure is reported with a
     * stack trace on standard error instead of being rethrown.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        // try-with-resources closes (and therefore flushes) both files on every
        // path, including early return and exceptions.
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(new FileInputStream(new File(INPUT_PATH))));
             BufferedWriter out = new BufferedWriter(new FileWriter(new File(OUTPUT_PATH)))) {

            String line = br.readLine();
            if (line == null) {
                // Empty input: leave an empty output file rather than failing.
                return;
            }
            out.write(line);
            out.write("\r\n");
            out.write(" \r\n");

            // readLine() returns null at end of stream, so test the freshly read
            // line before using it.
            while ((line = br.readLine()) != null) {
                String row = formatRow(line);
                if (row == null) {
                    continue;
                }
                System.out.print(row);
                System.out.print("\n");
                out.write(row);
                out.write("\r\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Splits a line at its last colon and pads the key to the value column.
     *
     * @param line one input line, never {@code null}
     * @return the aligned row without a line terminator, or {@code null} when the
     *         line has no colon or its last colon is the first or last character
     */
    private static String formatRow(String line) {
        int colon = line.lastIndexOf(':');
        if (colon <= 0 || colon == line.length() - 1) {
            return null;
        }
        StringBuilder row = new StringBuilder(line.substring(0, colon));
        // Always emit at least one space so a key of VALUE_COLUMN characters or
        // more does not run straight into its value.
        int padding = Math.max(1, VALUE_COLUMN - colon);
        for (int i = 0; i < padding; i++) {
            row.append(' ');
        }
        row.append(line.substring(colon + 1));
        return row.toString();
    }
}

由于输出的每一行中,字段名和字段值之间是用空格隔开的,接下来就可以按空格把数据分列,导入到Excel中