爬取模块(WebMagic)

一、概述

  本项目的职位数据主要由爬虫自动抓取,目前已成功爬取拉勾网、51job 等网站的职位和公司信息。本爬虫模块基于 WebMagic 框架,使用 HttpClient 发送 POST 请求,并将爬取到的信息筛选后存储至 MySQL 数据库。souzhi-crawler-service 的源代码可自行点击查看。

二、组织架构

souzhi-crawler-service
│  pom.xml
│  
└─src
    ├─main
    │  ├─java
    │  │  └─com
    │  │      └─couragehe
    │  │          └─souzhi
    │  │              │  SouzhiCrawlerServiceApplication.java
    │  │              │  
    │  │              └─crawler
    │  │                  ├─mapper
    │  │                  │      CompanyMapper.java
    │  │                  │      PositionDetailMapper.java
    │  │                  │      PositionMapper.java
    │  │                  │      
    │  │                  └─task
    │  │                      │  Application.java
    │  │                      │  TaskTest.java
    │  │                      │  WebsiteDownloader.java
    │  │                      │  WebsitePipeline.java
    │  │                      │  WebsiteProcessor.java
    │  │                      │  
    │  │                      └─website
    │  │                              Job51Spider.java
    │  │                              LagouSpider.java
    │  │                              WebsiteSpider.java
    │  │                              
    │  └─resources
    │      │  application.properties
    │      │  
    │      ├─mapper
    │      │      CompanyMapper.xml
    │      │      PositionDetailMapper.xml
    │      │      PositionMapper.xml
    │      │      
    │      ├─static
    │      └─templates
    └─test
        └─java
            └─com
                └─couragehe
                    └─souzhi
                            CrawlerTest.java
                            SouzhiCrawlerServiceApplicationTests.java

三、maven依赖

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the souzhi-crawler-service module.
  It inherits from souzhi-parent, depends on the sibling souzhi-api and
  souzhi-service-util modules, and pulls in the WebMagic crawler libraries.
-->
<project
        xmlns="http://maven.apache.org/POM/4.0.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <!-- Parent POM of the souzhi multi-module build -->
    <parent>
        <groupId>com.couragehe.souzhi</groupId>
        <artifactId>souzhi-parent</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <!-- Coordinates of this module -->
    <groupId>com.couragehe.souzhi</groupId>
    <artifactId>souzhi-crawler-service</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>souzhi-crawler-service</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
        <!-- Single place to bump both WebMagic artifacts together -->
        <webmagic.version>0.7.3</webmagic.version>
    </properties>

    <dependencies>
        <!-- Shared API module; logback-classic and log4j-over-slf4j are kept off the classpath -->
        <dependency>
            <groupId>com.couragehe.souzhi</groupId>
            <artifactId>souzhi-api</artifactId>
            <version>0.0.1-SNAPSHOT</version>
            <exclusions>
                <exclusion>
                    <groupId>ch.qos.logback</groupId>
                    <artifactId>logback-classic</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>log4j-over-slf4j</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- Shared service utilities module -->
        <dependency>
            <groupId>com.couragehe.souzhi</groupId>
            <artifactId>souzhi-service-util</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>

        <!-- Spring Boot devtools (hot reload); no version declared here,
             presumably managed by souzhi-parent - confirm -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <optional>true</optional>
        </dependency>

        <!-- WebMagic core library -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
        <!-- WebMagic extension library -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>${webmagic.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Spring Boot Maven plugin -->
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>

</project>

四、数据库设计

1、数据库模型

TIM截图20200416000551.png

2、SQL源码

-- ---------------------------------------------------------------
-- company: one row per company (name, logo, size, industry field,
-- financing stage, benefit labels).
-- The table is dropped and recreated whenever this script runs.
-- All text columns take utf8 / utf8_bin from the table defaults
-- declared at the bottom of the statement.
-- ---------------------------------------------------------------
DROP TABLE IF EXISTS `company`;
CREATE TABLE `company` (
  `id`                 varchar(255) NOT NULL,
  `company_name`       varchar(255) DEFAULT NULL,
  `company_logo`       varchar(255) DEFAULT NULL COMMENT '公司图标',
  `company_size`       varchar(255) DEFAULT NULL COMMENT '公司规模',
  `industry_field`     varchar(255) DEFAULT NULL COMMENT '行业领域',
  `finance_stage`      varchar(255) DEFAULT NULL COMMENT '融资阶段',
  `company_label_list` varchar(255) DEFAULT NULL COMMENT '公司优待标签',
  PRIMARY KEY (`id`)
)
  ENGINE = InnoDB
  DEFAULT CHARACTER SET = utf8
  COLLATE = utf8_bin
  ROW_FORMAT = COMPACT;

-- ----------------------------
-- Table structure for position
--
-- One row per job posting: title, owning company, skills, location,
-- salary, requirements, link to the detail page and source website.
-- The table is dropped and recreated whenever this script runs.
--
-- NOTE(review): `skill_lables` looks like a misspelling of "skill_labels";
-- the name is kept as-is because the mapper files presumably reference it.
-- NOTE(review): `create_time` / `format_create_time` are stored as text,
-- not as DATETIME - confirm this is intentional before relying on ordering.
-- ----------------------------
DROP TABLE IF EXISTS `position`;
CREATE TABLE `position`  (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `position_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司名称',
  `skill_lables` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '技能清单',
  `create_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职务创建时间',
  `format_create_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '格式化后的时间',
  `work_city` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作地点',
  `work_district` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作地区',
  `work_salary` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作薪资',
  `work_year` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作经验',
  `work_nature` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作性质',
  -- The comment literal below was corrupted (opening quote and first character masked), which made the statement unparsable
  `education_require` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '学历要求',
  `position_advantage` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职位优势',
  `is_school_job` int(32) NULL DEFAULT 0 COMMENT '是否校招',
  `detail_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '详情链接',
  `origin_website` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '来源网站',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for position_detail
--
-- Long-form details of a job posting: the description text and the exact
-- address. `position_id` presumably refers to `position`.`id` (no foreign
-- key is declared - confirm against the mappers).
-- The table is dropped and recreated whenever this script runs.
-- ----------------------------
DROP TABLE IF EXISTS `position_detail`;
CREATE TABLE `position_detail`  (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `position_id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `position_desc` text CHARACTER SET utf8 COLLATE utf8_bin NULL COMMENT '职位描述',
  `position_address` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职位具体地址',
  -- Declared explicitly so `id` is unique, matching the `company` and `position` tables
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

五、参考资料

WebMagic使用小结

拉勾网爬取(HttpClient)