http_parser源码解读

把源码中的头文件http_parser.h和源码http_parser.c直接拷贝到项目中(https://github.com/nodejs/http-parser),然后一起编译即可;

我们写一个简单的测试例子:

main.c

代码语言:c
复制
#include "http_parser.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <time.h>

static http_parser *parser;

/*
 * Notification callback for the start of a message.
 * Prints a banner and returns 0 (a non-zero return would be reported to
 * the parser as a callback error).
 */
int on_message_begin(http_parser *parser) {
  (void)parser; /* handle is not needed; the cast silences unused warnings */
  printf("\nMESSAGE BEGIN\n\n");
  return 0;
}

/*
 * Notification callback for the end of the header section.
 * Prints a banner and returns 0.
 */
int on_headers_complete(http_parser *parser) {
  (void)parser; /* handle is not needed; the cast silences unused warnings */
  printf("\nHEADERS COMPLETE\n\n");
  return 0;
}

/*
 * Notification callback for the end of a message.
 * Prints a banner and returns 0.
 */
int on_message_complete(http_parser *parser) {
  (void)parser; /* handle is not needed; the cast silences unused warnings */
  printf("\nMESSAGE COMPLETE\n\n");
  return 0;
}

/*
 * Data callback for URL bytes.
 * Prints `length` bytes starting at `at`; the explicit precision is used
 * instead of relying on a terminating NUL. Always returns 0.
 */
int on_url(http_parser *parser, const char *at, size_t length) {
  (void)parser; /* handle is not needed; the cast silences unused warnings */
  printf("Url: %.*s\n", (int)length, at);

  return 0;
}

/*
 * Data callback for header-name bytes.
 * Prints `length` bytes starting at `at` and returns 0.
 */
int on_header_field(http_parser *parser, const char *at, size_t length) {
  (void)parser; /* handle is not needed; the cast silences unused warnings */
  printf("Header field: %.*s\n", (int)length, at);
  return 0;
}

/*
 * Data callback for header-value bytes.
 * Prints `length` bytes starting at `at` and returns 0.
 */
int on_header_value(http_parser *parser, const char *at, size_t length) {
  (void)parser; /* handle is not needed; the cast silences unused warnings */
  printf("Header value: %.*s\n", (int)length, at);
  return 0;
}

/*
 * Data callback for body bytes.
 * Prints `length` bytes starting at `at` and returns 0.
 */
int on_body(http_parser *parser, const char *at, size_t length) {
  (void)parser; /* handle is not needed; the cast silences unused warnings */
  printf("Body: %.*s\n", (int)length, at);
  return 0;
}

/*
 * Demo driver: feeds one hard-coded request line to http_parser and lets
 * the callbacks defined above print what the parser reports.
 *
 * Returns EXIT_SUCCESS when every input byte was consumed, EXIT_FAILURE on
 * allocation failure or when the parser stopped early.
 */
int main(void) {
  http_parser_settings parser_set;
  char buf[1024] = "GET /a/b/c/d HTTP/1.1";
  size_t len = strlen(buf);
  size_t parsed;
  int rc = EXIT_SUCCESS;

  /* Zero every callback slot first: members that are never assigned below
   * (on_status, on_chunk_header, on_chunk_complete) would otherwise hold
   * indeterminate values. */
  http_parser_settings_init(&parser_set);

  /* Callbacks invoked by http_parser; header or body data can be
   * collected inside them. */
  parser_set.on_message_begin = on_message_begin;
  parser_set.on_header_field = on_header_field;
  parser_set.on_header_value = on_header_value;
  parser_set.on_url = on_url;
  parser_set.on_body = on_body;
  parser_set.on_headers_complete = on_headers_complete;
  parser_set.on_message_complete = on_message_complete;

  /* Allocate the per-request parser object. */
  parser = malloc(sizeof *parser);
  if (parser == NULL) {
    fprintf(stderr, "out of memory\n");
    return EXIT_FAILURE;
  }

  http_parser_init(parser, HTTP_REQUEST); /* parse as a request */
  parsed = http_parser_execute(parser, &parser_set, buf, len);

  if (parsed != len) {
    /* The parser stopped before the end of the input. */
    fprintf(stderr, "parse error: %s (%s)\n",
            http_errno_name(HTTP_PARSER_ERRNO(parser)),
            http_errno_description(HTTP_PARSER_ERRNO(parser)));
    rc = EXIT_FAILURE;
  } else {
    /* A zero-length call tells the parser that the input has ended. */
    http_parser_execute(parser, &parser_set, buf, 0);
  }

  free(parser);
  parser = NULL;

  return rc;
}

使用主要分三步:

1. 申请一块http_parser大小的内存作为当前请求的parser对象,里面包含了对这次请求的解析信息;

2.申请一块http_parser_settings大小内存作为设置对象,它包含了我们设置的各种回调函数;

3. 调用http_parser_execute解析请求串,根据返回值parsed与输入的总字节数是否相等来判断解析成功还是失败;

下面我们为源码添加一些注释,大家可以自行对照代码去分析里面状态机的转换过程。

http_parser.h

代码语言:c
复制
/* Copyright Joyent, Inc. and other Node contributors. All rights reserved.
*

  • Permission is hereby granted, free of charge, to any person obtaining a copy
  • of this software and associated documentation files (the "Software"), to
  • deal in the Software without restriction, including without limitation the
  • rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  • sell copies of the Software, and to permit persons to whom the Software is
  • furnished to do so, subject to the following conditions:
  • The above copyright notice and this permission notice shall be included in
  • all copies or substantial portions of the Software.
  • THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  • IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  • FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  • AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  • LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  • FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  • IN THE SOFTWARE.
    */
    #ifndef http_parser_h
    #define http_parser_h
    #ifdef __cplusplus
    extern "C"
    {
    #endif

/* Also update SONAME in the Makefile whenever you change these. */
#define HTTP_PARSER_VERSION_MAJOR 2
#define HTTP_PARSER_VERSION_MINOR 9
#define HTTP_PARSER_VERSION_PATCH 4

#include <stddef.h>
/*
 * Provide the fixed-width integer types.
 * Old MSVC toolchains (before _MSC_VER 1600) get local typedefs, Solaris 9
 * uses <sys/inttypes.h>, and everything else uses <stdint.h>.
 * The #if condition spans two lines, so the first must end in a backslash.
 */
#if defined(_WIN32) && !defined(__MINGW32__) && \
  (!defined(_MSC_VER) || _MSC_VER < 1600) && !defined(__WINE__)
#include <BaseTsd.h>
typedef __int8 int8_t;
typedef unsigned __int8 uint8_t;
typedef __int16 int16_t;
typedef unsigned __int16 uint16_t;
typedef __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef __int64 int64_t;
typedef unsigned __int64 uint64_t;
#elif (defined(__sun) || defined(__sun__)) && defined(__SunOS_5_9)
#include <sys/inttypes.h>
#else
#include <stdint.h>
#endif

/* Compile with -DHTTP_PARSER_STRICT=0 to make fewer checks, but run
 * faster. When the macro is not defined by the build, it defaults to 1.
 */
    #ifndef HTTP_PARSER_STRICT
    #define HTTP_PARSER_STRICT 1
    #endif

/* Maximum header size allowed. If the macro is not defined
 * before including this header then the default is used. To
 * change the maximum header size, define the macro in the build
 * environment (e.g. -DHTTP_MAX_HEADER_SIZE=<value>). To remove
 * the effective limit on the size of the header, define the macro
 * to a very large number (e.g. -DHTTP_MAX_HEADER_SIZE=0x7fffffff)
 */
    #ifndef HTTP_MAX_HEADER_SIZE
    #define HTTP_MAX_HEADER_SIZE (80 * 1024)
    #endif

    typedef struct http_parser http_parser;
    typedef struct http_parser_settings http_parser_settings;

    /* Callbacks should return non-zero to indicate an error. The parser will
     * then halt execution.
     *
     * The one exception is on_headers_complete. In a HTTP_RESPONSE parser
     * returning '1' from on_headers_complete will tell the parser that it
     * should not expect a body. This is used when receiving a response to a
     * HEAD request which may contain 'Content-Length' or 'Transfer-Encoding:
     * chunked' headers that indicate the presence of a body.
     *
     * Returning 2 from on_headers_complete will tell the parser that it should
     * expect neither a body nor any further responses on this connection. This
     * is useful for handling responses to a CONNECT request which may not
     * contain Upgrade or Connection: upgrade headers.
     *
     * http_data_cb does not return data chunks. It will be called arbitrarily
     * many times for each string. E.G. you might get 10 callbacks for "on_url"
     * each providing just a few characters more data.
     */
    typedef int (*http_data_cb)(http_parser *, const char *at, size_t length);
    typedef int (*http_cb)(http_parser *);

/* Status Codes
 *
 * X-macro table of HTTP status codes. Each XX(num, name, string) entry
 * supplies the numeric status code, the identifier suffix for the matching
 * constant, and the reason phrase (unquoted, so it can be stringified).
 * Every line but the last must end in a backslash so that the whole table
 * is part of the macro body.
 */
#define HTTP_STATUS_MAP(XX) \
  XX(100, CONTINUE, Continue) \
  XX(101, SWITCHING_PROTOCOLS, Switching Protocols) \
  XX(102, PROCESSING, Processing) \
  XX(200, OK, OK) \
  XX(201, CREATED, Created) \
  XX(202, ACCEPTED, Accepted) \
  XX(203, NON_AUTHORITATIVE_INFORMATION, Non-Authoritative Information) \
  XX(204, NO_CONTENT, No Content) \
  XX(205, RESET_CONTENT, Reset Content) \
  XX(206, PARTIAL_CONTENT, Partial Content) \
  XX(207, MULTI_STATUS, Multi-Status) \
  XX(208, ALREADY_REPORTED, Already Reported) \
  XX(226, IM_USED, IM Used) \
  XX(300, MULTIPLE_CHOICES, Multiple Choices) \
  XX(301, MOVED_PERMANENTLY, Moved Permanently) \
  XX(302, FOUND, Found) \
  XX(303, SEE_OTHER, See Other) \
  XX(304, NOT_MODIFIED, Not Modified) \
  XX(305, USE_PROXY, Use Proxy) \
  XX(307, TEMPORARY_REDIRECT, Temporary Redirect) \
  XX(308, PERMANENT_REDIRECT, Permanent Redirect) \
  XX(400, BAD_REQUEST, Bad Request) \
  XX(401, UNAUTHORIZED, Unauthorized) \
  XX(402, PAYMENT_REQUIRED, Payment Required) \
  XX(403, FORBIDDEN, Forbidden) \
  XX(404, NOT_FOUND, Not Found) \
  XX(405, METHOD_NOT_ALLOWED, Method Not Allowed) \
  XX(406, NOT_ACCEPTABLE, Not Acceptable) \
  XX(407, PROXY_AUTHENTICATION_REQUIRED, Proxy Authentication Required) \
  XX(408, REQUEST_TIMEOUT, Request Timeout) \
  XX(409, CONFLICT, Conflict) \
  XX(410, GONE, Gone) \
  XX(411, LENGTH_REQUIRED, Length Required) \
  XX(412, PRECONDITION_FAILED, Precondition Failed) \
  XX(413, PAYLOAD_TOO_LARGE, Payload Too Large) \
  XX(414, URI_TOO_LONG, URI Too Long) \
  XX(415, UNSUPPORTED_MEDIA_TYPE, Unsupported Media Type) \
  XX(416, RANGE_NOT_SATISFIABLE, Range Not Satisfiable) \
  XX(417, EXPECTATION_FAILED, Expectation Failed) \
  XX(421, MISDIRECTED_REQUEST, Misdirected Request) \
  XX(422, UNPROCESSABLE_ENTITY, Unprocessable Entity) \
  XX(423, LOCKED, Locked) \
  XX(424, FAILED_DEPENDENCY, Failed Dependency) \
  XX(426, UPGRADE_REQUIRED, Upgrade Required) \
  XX(428, PRECONDITION_REQUIRED, Precondition Required) \
  XX(429, TOO_MANY_REQUESTS, Too Many Requests) \
  XX(431, REQUEST_HEADER_FIELDS_TOO_LARGE, Request Header Fields Too Large) \
  XX(451, UNAVAILABLE_FOR_LEGAL_REASONS, Unavailable For Legal Reasons) \
  XX(500, INTERNAL_SERVER_ERROR, Internal Server Error) \
  XX(501, NOT_IMPLEMENTED, Not Implemented) \
  XX(502, BAD_GATEWAY, Bad Gateway) \
  XX(503, SERVICE_UNAVAILABLE, Service Unavailable) \
  XX(504, GATEWAY_TIMEOUT, Gateway Timeout) \
  XX(505, HTTP_VERSION_NOT_SUPPORTED, HTTP Version Not Supported) \
  XX(506, VARIANT_ALSO_NEGOTIATES, Variant Also Negotiates) \
  XX(507, INSUFFICIENT_STORAGE, Insufficient Storage) \
  XX(508, LOOP_DETECTED, Loop Detected) \
  XX(510, NOT_EXTENDED, Not Extended) \
  XX(511, NETWORK_AUTHENTICATION_REQUIRED, Network Authentication Required)

/*
 * The declaration below is produced by two rounds of macro replacement.
 *
 * Round 1: HTTP_STATUS_MAP(XX) is replaced by its list of XX(...) entries:
 *
 *   enum http_status {
 *     XX(100, CONTINUE, Continue)
 *     XX(101, SWITCHING_PROTOCOLS, Switching Protocols)
 *     XX(102, PROCESSING, Processing)
 *     XX(200, OK, OK)
 *     ...
 *   };
 *
 * Round 2: each XX(num, name, string) becomes "HTTP_STATUS_<name> = num,":
 *
 *   enum http_status {
 *     HTTP_STATUS_CONTINUE = 100,
 *     HTTP_STATUS_SWITCHING_PROTOCOLS = 101,
 *     HTTP_STATUS_PROCESSING = 102,
 *     HTTP_STATUS_OK = 200,
 *     ...
 *   };
 *
 * The result is an enumeration of every status code in the table.
 * XX is defined only for this expansion and undefined again right after.
 */

enum http_status
{

#define XX(num, name, string) HTTP_STATUS_##name = num,
HTTP_STATUS_MAP(XX)
#undef XX
};

/* Request Methods
 *
 * X-macro table of request methods. Each XX(num, name, string) entry
 * supplies the numeric value, the identifier suffix, and the method text
 * (unquoted, so it can be stringified). Every line but the last must end
 * in a backslash; the group comments sit inside the macro body.
 */
#define HTTP_METHOD_MAP(XX) \
  XX(0, DELETE, DELETE) \
  XX(1, GET, GET) \
  XX(2, HEAD, HEAD) \
  XX(3, POST, POST) \
  XX(4, PUT, PUT) \
  /* pathological */ \
  XX(5, CONNECT, CONNECT) \
  XX(6, OPTIONS, OPTIONS) \
  XX(7, TRACE, TRACE) \
  /* WebDAV */ \
  XX(8, COPY, COPY) \
  XX(9, LOCK, LOCK) \
  XX(10, MKCOL, MKCOL) \
  XX(11, MOVE, MOVE) \
  XX(12, PROPFIND, PROPFIND) \
  XX(13, PROPPATCH, PROPPATCH) \
  XX(14, SEARCH, SEARCH) \
  XX(15, UNLOCK, UNLOCK) \
  XX(16, BIND, BIND) \
  XX(17, REBIND, REBIND) \
  XX(18, UNBIND, UNBIND) \
  XX(19, ACL, ACL) \
  /* subversion */ \
  XX(20, REPORT, REPORT) \
  XX(21, MKACTIVITY, MKACTIVITY) \
  XX(22, CHECKOUT, CHECKOUT) \
  XX(23, MERGE, MERGE) \
  /* upnp */ \
  XX(24, MSEARCH, M-SEARCH) \
  XX(25, NOTIFY, NOTIFY) \
  XX(26, SUBSCRIBE, SUBSCRIBE) \
  XX(27, UNSUBSCRIBE, UNSUBSCRIBE) \
  /* RFC-5789 */ \
  XX(28, PATCH, PATCH) \
  XX(29, PURGE, PURGE) \
  /* CalDAV */ \
  XX(30, MKCALENDAR, MKCALENDAR) \
  /* RFC-2068, section 19.6.1.2 */ \
  XX(31, LINK, LINK) \
  XX(32, UNLINK, UNLINK) \
  /* icecast */ \
  XX(33, SOURCE, SOURCE)

/*
 * Same X-macro technique as for the status codes: HTTP_METHOD_MAP(XX) is
 * expanded with XX defined as "HTTP_<name> = num,", giving
 *
 *   enum http_method {
 *     HTTP_DELETE = 0,
 *     HTTP_GET = 1,
 *     ...
 *   };
 *
 * i.e. an enumeration of every request method in the table.
 */

enum http_method
{

#define XX(num, name, string) HTTP_##name = num,
HTTP_METHOD_MAP(XX)
#undef XX
};

/* Kind of message a parser handles: a request, a response, or both
 * (HTTP_BOTH). Passed as the `type` argument of http_parser_init(). */
enum http_parser_type
{
	HTTP_REQUEST,
	HTTP_RESPONSE,
	HTTP_BOTH
};

/* Flag values for http_parser.flags field.
 *
 * Conditions observed while parsing a message, one bit each. There are
 * eight flags, matching the 8-bit `flags` bit-field in struct http_parser.
 */
enum flags
{
  /* chunked transfer */
  F_CHUNKED = 1 << 0,
  /* keep the connection alive */
  F_CONNECTION_KEEP_ALIVE = 1 << 1,
  /* close the connection */
  F_CONNECTION_CLOSE = 1 << 2,
  /* connection upgrade requested */
  F_CONNECTION_UPGRADE = 1 << 3,
  /* chunked data has been read completely */
  F_TRAILING = 1 << 4,
  /* protocol upgrade */
  F_UPGRADE = 1 << 5,
  /* skip the body */
  F_SKIPBODY = 1 << 6,
  /* the message carries a body length value */
  F_CONTENTLENGTH = 1 << 7
};

/* Map for errno-related constants
 *
 * The provided argument should be a macro that takes 2 arguments:
 * the identifier suffix of the error and its description string.
 * Every line but the last must end in a backslash; the group comments
 * sit inside the macro body.
 */
#define HTTP_ERRNO_MAP(XX) \
  /* No error */ \
  XX(OK, "success") \
  \
  /* Callback-related errors */ \
  XX(CB_message_begin, "the on_message_begin callback failed") \
  XX(CB_url, "the on_url callback failed") \
  XX(CB_header_field, "the on_header_field callback failed") \
  XX(CB_header_value, "the on_header_value callback failed") \
  XX(CB_headers_complete, "the on_headers_complete callback failed") \
  XX(CB_body, "the on_body callback failed") \
  XX(CB_message_complete, "the on_message_complete callback failed") \
  XX(CB_status, "the on_status callback failed") \
  XX(CB_chunk_header, "the on_chunk_header callback failed") \
  XX(CB_chunk_complete, "the on_chunk_complete callback failed") \
  \
  /* Parsing-related errors */ \
  XX(INVALID_EOF_STATE, "stream ended at an unexpected time") \
  XX(HEADER_OVERFLOW, \
     "too many header bytes seen; overflow detected") \
  XX(CLOSED_CONNECTION, \
     "data received after completed connection: close message") \
  XX(INVALID_VERSION, "invalid HTTP version") \
  XX(INVALID_STATUS, "invalid HTTP status code") \
  XX(INVALID_METHOD, "invalid HTTP method") \
  XX(INVALID_URL, "invalid URL") \
  XX(INVALID_HOST, "invalid host") \
  XX(INVALID_PORT, "invalid port") \
  XX(INVALID_PATH, "invalid path") \
  XX(INVALID_QUERY_STRING, "invalid query string") \
  XX(INVALID_FRAGMENT, "invalid fragment") \
  XX(LF_EXPECTED, "LF character expected") \
  XX(INVALID_HEADER_TOKEN, "invalid character in header") \
  XX(INVALID_CONTENT_LENGTH, \
     "invalid character in content-length header") \
  XX(UNEXPECTED_CONTENT_LENGTH, \
     "unexpected content-length header") \
  XX(INVALID_CHUNK_SIZE, \
     "invalid character in chunk size header") \
  XX(INVALID_CONSTANT, "invalid constant string") \
  XX(INVALID_INTERNAL_STATE, "encountered unexpected internal state") \
  XX(STRICT, "strict mode assertion failed") \
  XX(PAUSED, "parser is paused") \
  XX(UNKNOWN, "an unknown error occurred") \
  XX(INVALID_TRANSFER_ENCODING, \
     "request has invalid transfer-encoding")

/*
 * Same X-macro technique as above: HTTP_ERRNO_MAP is expanded with
 * HTTP_ERRNO_GEN, which keeps only the name and prefixes it with HPE_,
 * giving
 *
 *   enum http_errno
 *   {
 *     HPE_OK,
 *     HPE_CB_message_begin,
 *     HPE_CB_url,
 *     ...
 *   };
 *
 * i.e. an enumeration of every error kind. No explicit values are
 * assigned, so the constants number from 0 in table order.
 */

/* Define HPE_* values for each errno value above */
#define HTTP_ERRNO_GEN(n, s) HPE_##n,
enum http_errno
{
HTTP_ERRNO_MAP(HTTP_ERRNO_GEN)
};
#undef HTTP_ERRNO_GEN

/* Get an http_errno value from an http_parser */
#define HTTP_PARSER_ERRNO(p) ((enum http_errno)(p)->http_errno)

/*
 * Per-message parser object: holds the parse state and the information
 * extracted so far. A fresh one is initialised each time a connection
 * starts handling a message.
 */
struct http_parser
{
  /** PRIVATE **/
  /* 2 bits: which kind of message is being parsed */
  unsigned int type : 2;                   /* enum http_parser_type */
  /* 8 bits: the F_* flag bits */
  unsigned int flags : 8;                  /* F_* values from 'flags' enum; semi-public */
  /* 7 bits: current state of the message state machine */
  unsigned int state : 7;                  /* enum state from http_parser.c */
  /* 7 bits: current state of the header-field state machine */
  unsigned int header_state : 7;           /* enum header_state from http_parser.c */
  /* 5 bits: position within the item currently being matched (method, url,
   * version, header, ...); reset to 0 for each new item */
  unsigned int index : 5;                  /* index into current matcher */
  /* 1 bit: a Transfer-Encoding header was seen */
  unsigned int uses_transfer_encoding : 1; /* Transfer-Encoding header is present */
  /* 1 bit: tolerate Content-Length together with chunked encoding */
  unsigned int allow_chunked_length : 1;   /* Allow headers with both
                                            * `Content-Length` and
                                            * `Transfer-Encoding: chunked` set */
  /* 1 bit: relax the character-set restrictions on header fields */
  unsigned int lenient_http_headers : 1;

  /* running count of bytes the parser has examined */
  uint32_t nread;          /* # bytes read in various scenarios */
  /* value of the Content-Length header, when there is one */
  uint64_t content_length; /* # bytes in body. `(uint64_t) -1` (all bits one)
                            * if no Content-Length header.
                            */

  /** READ-ONLY **/
  /* HTTP major version */
  unsigned short http_major;
  /* HTTP minor version */
  unsigned short http_minor;
  /* 16-bit status code */
  unsigned int status_code : 16; /* responses only */
  /* 8-bit request method */
  unsigned int method : 8;       /* requests only */
  /* error state of the parser; only the OK value means no error */
  unsigned int http_errno : 7;

  /* 1 = Upgrade header was present and the parser has exited because of that.
   * 0 = No upgrade header present.
   * Should be checked when http_parser_execute() returns in addition to
   * error checking.
   */
  unsigned int upgrade : 1;

  /** PUBLIC **/
  /* hook for associating caller-owned data with this parser */
  void *data; /* A pointer to get hook to the "connection" or "socket" object */
};

/*
 * Table of user-supplied callbacks that http_parser_execute() invokes.
 * http_cb members are plain notifications; http_data_cb members also
 * receive a (pointer, length) span and, as noted on the http_data_cb
 * typedef, may be called many times for one logical string.
 */
struct http_parser_settings
{ 
    /* fired when parsing of a message begins */
	http_cb on_message_begin;
    /* receives request-URL data */
	http_data_cb on_url;
    /* receives the status text when parsing a response */
	http_data_cb on_status;
    /* receives header-name data */
	http_data_cb on_header_field;
    /* receives header-value data */
	http_data_cb on_header_value;
    /* fired once all header lines (zero or more) have been parsed */
	http_cb on_headers_complete;
    /* receives message body data */
	http_data_cb on_body;
    /* fired when the whole message has been parsed */
	http_cb on_message_complete;
	/* When on_chunk_header is called, the current chunk length is stored
	 * in parser->content_length.
	 */
    /* fired after a chunk-size line has been parsed */
    http_cb on_chunk_header;
    /* fired after the data of that chunk has been read */
    http_cb on_chunk_complete;
    };
/* Indices of the URL components; UF_* values index field_data[] in
 * struct http_parser_url and name the bits of its field_set mask. */
enum http_parser_url_fields
{
    /* scheme (protocol) */
	UF_SCHEMA = 0,
    /* host: domain name or IP address */
	UF_HOST = 1,
    /* port number */
	UF_PORT = 2,
    /* request path */
	UF_PATH = 3,
    /* query parameters */
	UF_QUERY = 4,
    /* fragment (hash) */
	UF_FRAGMENT = 5,
    /* user info */
	UF_USERINFO = 6,
    /* number of fields; used as the length of field_data[] */
	UF_MAX = 7
};

/* Result structure for http_parser_parse_url().
 *
 * Callers should index into field_data[] with UF_* values iff field_set
 * has the relevant (1 << UF_*) bit set. As a courtesy to clients (and
 * because we probably have padding left over), we convert any port to
 * a uint16_t.
 */
struct http_parser_url
{
  /* which URL parts are present: the matching bit is set to 1 */
  uint16_t field_set; /* Bitmask of (1 << UF_*) values */
  /* port number */
  uint16_t port;      /* Converted UF_PORT string */

  /* offset and length of each URL part within the parsed buffer */
  struct
  {
    uint16_t off; /* Offset into buffer in which field starts */
    uint16_t len; /* Length of run in buffer */
  } field_data[UF_MAX];
};

    /* Returns the library version. Bits 16-23 contain the major version number,
     * bits 8-15 the minor version number and bits 0-7 the patch level.
     * Usage example:
     *
     *   unsigned long version = http_parser_version();
     *   unsigned major = (version >> 16) & 255;
     *   unsigned minor = (version >> 8) & 255;
     *   unsigned patch = version & 255;
     *   printf("http_parser v%u.%u.%u\n", major, minor, patch);
     */
    unsigned long http_parser_version(void);

/* Initialize `parser` for the given message type. */
void http_parser_init(http_parser *parser, enum http_parser_type type);

/* Initialize http_parser_settings members to 0
 */
void http_parser_settings_init(http_parser_settings *settings);

/* Executes the parser. Returns number of parsed bytes. Sets
 * `parser->http_errno` on error. */
    size_t http_parser_execute(http_parser *parser,
    const http_parser_settings *settings,
    const char *data,
    size_t len);

    /* If http_should_keep_alive() in the on_headers_complete or
     * on_message_complete callback returns 0, then this should be
     * the last message on the connection.
     * If you are the server, respond with the "Connection: close" header.
     * If you are the client, close the connection.
     */
    int http_should_keep_alive(const http_parser *parser);

    /* Returns a string version of the HTTP method. */
    const char *http_method_str(enum http_method m);

    /* Returns a string version of the HTTP status code. */
    const char *http_status_str(enum http_status s);

    /* Return a string name of the given error */
    const char *http_errno_name(enum http_errno err);

    /* Return a string description of the given error */
    const char *http_errno_description(enum http_errno err);

    /* Initialize all http_parser_url members to 0 */
    void http_parser_url_init(struct http_parser_url *u);

    /* Parse a URL; return nonzero on failure */
    int http_parser_parse_url(const char *buf, size_t buflen,
    int is_connect,
    struct http_parser_url *u);

    /* Pause or un-pause the parser; a nonzero value pauses */
    void http_parser_pause(http_parser *parser, int paused);

    /* Checks if this is the final chunk of the body. */
    int http_body_is_final(const http_parser *parser);

    /* Change the maximum header size provided at compile time. */
    void http_parser_set_max_header_size(uint32_t size);

#ifdef __cplusplus
}
#endif
#endif

http_parser.c由于字符数超标,文件地址为: http_parser.c

里面包含了全部内容,大家请自行查阅哈。