爬取模块(WebMagic)
一、概述
该项目的职位数据主要由爬虫自动爬取,现已成功爬取拉勾网、51job等网站的职位和公司信息。本爬虫模块基于WebMagic框架,使用HttpClient发送POST请求,再将爬取到的信息筛选后存储至MySQL数据库。souzhi-crawler-service的源代码可自行点击查看。
二、组织架构
souzhi-crawler-service
│  pom.xml
│
└─src
    ├─main
    │  ├─java
    │  │  └─com
    │  │      └─couragehe
    │  │          └─souzhi
    │  │              │  SouzhiCrawlerServiceApplication.java
    │  │              │
    │  │              └─crawler
    │  │                  ├─mapper
    │  │                  │      CompanyMapper.java
    │  │                  │      PositionDetailMapper.java
    │  │                  │      PositionMapper.java
    │  │                  │
    │  │                  └─task
    │  │                      │  Application.java
    │  │                      │  TaskTest.java
    │  │                      │  WebsiteDownloader.java
    │  │                      │  WebsitePipeline.java
    │  │                      │  WebsiteProcessor.java
    │  │                      │
    │  │                      └─website
    │  │                              Job51Spider.java
    │  │                              LagouSpider.java
    │  │                              WebsiteSpider.java
    │  │
    │  └─resources
    │      │  application.properties
    │      │
    │      ├─mapper
    │      │      CompanyMapper.xml
    │      │      PositionDetailMapper.xml
    │      │      PositionMapper.xml
    │      │
    │      ├─static
    │      └─templates
    └─test
        └─java
            └─com
                └─couragehe
                    └─souzhi
                            CrawlerTest.java
                            SouzhiCrawlerServiceApplicationTests.java
三、maven依赖
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the souzhi-crawler-service module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Inherits from the shared souzhi parent POM. -->
    <parent>
        <groupId>com.couragehe.souzhi</groupId>
        <artifactId>souzhi-parent</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <!-- Coordinates of this module. -->
    <groupId>com.couragehe.souzhi</groupId>
    <artifactId>souzhi-crawler-service</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>souzhi-crawler-service</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
    </properties>

    <dependencies>
        <!-- Project API module; logback-classic and log4j-over-slf4j are excluded from it. -->
        <dependency>
            <groupId>com.couragehe.souzhi</groupId>
            <artifactId>souzhi-api</artifactId>
            <version>0.0.1-SNAPSHOT</version>
            <exclusions>
                <exclusion>
                    <groupId>ch.qos.logback</groupId>
                    <artifactId>logback-classic</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>log4j-over-slf4j</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- Project service utility module. -->
        <dependency>
            <groupId>com.couragehe.souzhi</groupId>
            <artifactId>souzhi-service-util</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>

        <!-- Spring Boot hot-deployment (devtools) plugin -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <optional>true</optional>
        </dependency>

        <!-- WebMagic core package -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <!-- WebMagic extension package; version kept in step with webmagic-core. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>
</project>
四、数据库设计
1、数据库模型
2、SQL源码
-- Schema for the crawler's storage: `company`, `position` and `position_detail`.
--
-- Every table is dropped and recreated, so running this script discards any
-- rows already stored in these tables.
--
-- All text columns use utf8 / utf8_bin (byte-wise, case-sensitive comparison).
-- With ROW_FORMAT = Compact an index key prefix is limited to 767 bytes; a
-- varchar(255) utf8 key needs at most 765 bytes, so the varchar(255) primary
-- keys fit. Moving to utf8mb4 would require shorter keys or another row format.

-- ----------------------------
-- Table structure for company
-- ----------------------------
DROP TABLE IF EXISTS `company`;
CREATE TABLE `company` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `company_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_logo` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司图标',
  `company_size` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司规模',
  `industry_field` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '行业领域',
  `finance_stage` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '融资阶段',
  `company_label_list` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司优待标签',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for position
-- ----------------------------
-- NOTE(review): `company_id` presumably refers to `company`.`id`; no foreign
-- key is declared here, so that relationship is not enforced by the database.
-- NOTE(review): `skill_lables` looks like a misspelling of "labels". The name
-- is kept unchanged because code outside this script may reference it.
DROP TABLE IF EXISTS `position`;
CREATE TABLE `position` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `position_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `company_name` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '公司名称',
  `skill_lables` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '技能清单',
  `create_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职务创建时间',
  `format_create_time` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '格式化后的时间',
  `work_city` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作地点',
  `work_district` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作地区',
  `work_salary` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作薪资',
  `work_year` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作经验',
  `work_nature` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '工作性质',
  -- The comment literal below was restored; the original text had its opening
  -- quote and first character masked out, which left the string unbalanced.
  `education_require` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '学历要求',
  `position_advantage` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职位优势',
  `is_school_job` int(32) NULL DEFAULT 0 COMMENT '是否校招',
  `detail_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '详情链接',
  `origin_website` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '来源网站',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;

-- ----------------------------
-- Table structure for position_detail
-- ----------------------------
-- NOTE(review): `position_id` presumably refers to `position`.`id`; no foreign
-- key or index is declared on it here.
DROP TABLE IF EXISTS `position_detail`;
CREATE TABLE `position_detail` (
  `id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,
  `position_id` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL,
  `position_desc` text CHARACTER SET utf8 COLLATE utf8_bin NULL COMMENT '职位描述',
  `position_address` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NULL DEFAULT NULL COMMENT '职位具体地址',
  -- Primary key added so this table matches `company` and `position`; the
  -- original definition declared a NOT NULL `id` but no key at all.
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_bin ROW_FORMAT = Compact;