爬取模块(WebMagic)
一、概述
该项目的职位数据主要来自爬虫自动爬取,现已成功爬取拉勾网、51job等网站的职位及公司信息。本爬虫模块采用WebMagic框架,使用HttpClient发送POST请求,然后将爬取到的信息筛选后存储至MySQL数据库。其中souzhi-crawler-service源代码可自行点击查看。
二、组织架构
souzhi-crawler-service
│  pom.xml
│  
└─src
    ├─main
    │  ├─java
    │  │  └─com
    │  │      └─couragehe
    │  │          └─souzhi
    │  │              │  SouzhiCrawlerServiceApplication.java
    │  │              │  
    │  │              └─crawler
    │  │                  ├─mapper
    │  │                  │      CompanyMapper.java
    │  │                  │      PositionDetailMapper.java
    │  │                  │      PositionMapper.java
    │  │                  │      
    │  │                  └─task
    │  │                      │  Application.java
    │  │                      │  TaskTest.java
    │  │                      │  WebsiteDownloader.java
    │  │                      │  WebsitePipeline.java
    │  │                      │  WebsiteProcessor.java
    │  │                      │  
    │  │                      └─website
    │  │                              Job51Spider.java
    │  │                              LagouSpider.java
    │  │                              WebsiteSpider.java
    │  │                              
    │  └─resources
    │      │  application.properties
    │      │  
    │      ├─mapper
    │      │      CompanyMapper.xml
    │      │      PositionDetailMapper.xml
    │      │      PositionMapper.xml
    │      │      
    │      ├─static
    │      └─templates
    └─test
        └─java
            └─com
                └─couragehe
                    └─souzhi
                            CrawlerTest.java
                            SouzhiCrawlerServiceApplicationTests.java
三、Maven依赖
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the souzhi-crawler-service module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Shared parent POM of the souzhi modules. -->
    <parent>
        <groupId>com.couragehe.souzhi</groupId>
        <artifactId>souzhi-parent</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <!-- Coordinates of this module. -->
    <groupId>com.couragehe.souzhi</groupId>
    <artifactId>souzhi-crawler-service</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>souzhi-crawler-service</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
        <!-- Single source of truth for both WebMagic artifacts below. -->
        <webmagic.version>0.7.3</webmagic.version>
    </properties>

    <dependencies>
        <!-- Project API module; two logging artifacts are excluded from its transitive dependencies. -->
        <!-- NOTE(review): presumably excluded to avoid a logging-binding clash with WebMagic - confirm. -->
        <dependency>
            <groupId>com.couragehe.souzhi</groupId>
            <artifactId>souzhi-api</artifactId>
            <version>0.0.1-SNAPSHOT</version>
            <exclusions>
                <exclusion>
                    <groupId>ch.qos.logback</groupId>
                    <artifactId>logback-classic</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>log4j-over-slf4j</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- Project service utility module. -->
        <dependency>
            <groupId>com.couragehe.souzhi</groupId>
            <artifactId>souzhi-service-util</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>

        <!-- Spring Boot hot-deployment (devtools) support; optional so it is not passed on to dependants. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <optional>true</optional>
        </dependency>

        <!-- WebMagic core package. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.version}</version>
        </dependency>

        <!-- WebMagic extension package. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Spring Boot Maven plugin; no version or configuration is declared here. -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
 四、数据库设计
1、数据库模型

2、SQL源码
-- MySQL schema for the crawler module: `company`, `position`, `position_detail`.
-- Each statement sits on its own line so that the `--` section banners no longer
-- comment out the DDL that follows them.
-- The script is destructive: every table is dropped (if present) and recreated.

-- ----------------------------
-- Table structure for company
-- ----------------------------
DROP TABLE IF EXISTS `company`;
CREATE TABLE `company` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `company_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_logo` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司图标',
  `company_size` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司规模',
  `industry_field` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '行业领域',
  `finance_stage` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '融资阶段',
  `company_label_list` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司优待标签',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for position
-- ----------------------------
-- NOTE(review): `company_id` is not declared as a foreign key to `company`.`id`.
-- NOTE(review): `skill_lables` is spelled as in the original schema; it is kept
-- unchanged because code outside this file may reference the column by that name.
DROP TABLE IF EXISTS `position`;
CREATE TABLE `position` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `position_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司名称',
  `skill_lables` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '技能清单',
  `create_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职务创建时间',
  `format_create_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '格式化后的时间',
  `work_city` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作地点',
  `work_district` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作地区',
  `work_salary` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作薪资',
  `work_year` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作经验',
  `work_nature` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作性质',
  -- The comment literal below was garbled in the original (opening quote lost); restored.
  `education_require` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '学历要求',
  `position_advantage` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职位优势',
  `is_school_job` int(32) NULL DEFAULT 0 COMMENT '是否校招',
  `detail_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '详情链接',
  `origin_website` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '来源网站',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for position_detail
-- ----------------------------
-- NOTE(review): unlike `company` and `position`, this table declares no PRIMARY KEY
-- and no index on `position_id`. Left as in the original; confirm whether
-- PRIMARY KEY (`id`) should be added before relying on `id` being unique.
DROP TABLE IF EXISTS `position_detail`;
CREATE TABLE `position_detail` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `position_id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `position_desc` text CHARACTER SET utf8 COLLATE utf8_bin NULL COMMENT '职位描述',
  `position_address` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职位具体地址'
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

 京公网安备 11010502036488号
京公网安备 11010502036488号