thrift 序列化字段读写的一个小坑

前段时间跟同事一起联调某系统时遇到一个问题：client发送thrift序列化后的数据，本地打log能正确读到某个optional字段，而server却收不到该字段的值，感到比较诡异；后来换了一种字段赋值的写法就ok了。花了一点时间踩了这个小坑，下面先看业务代码片段，再做源码分析，最后与protobuf的对应实现作简单比较，与大家分享下。

先看下thrift IDL在序列化读写时的用法，以某业务中简化版为例，定义及两种读写方式如下：

// Simplified URL record used as the running example.
struct UrlItem {
	// Required field, id 1.
	1: required string url;
	// Optional field, id 2.
	2: optional string referer;
}

// Usage 1: assign the public members directly.
// NOTE: plain assignment does not touch url_item.__isset.referer, unlike __set_referer().
UrlItem url_item;
url_item.url = "http://47.110.236.62/";
url_item.referer = "https://www.google.com.hk/";

// The log call reads the members directly, so both values are printed locally.
log(LOG_NOTICE, "url:%s\treferer:%s\n", 
	url_item.url.c_str(), 
	url_item.referer.c_str());

// Usage 2: go through the thrift-generated setters.
url_item.__set_url("http://47.110.236.62/");
url_item.__set_referer("https://www.google.com.hk/");

log(LOG_NOTICE, "url:%s\treferer:%s\n", 
	url_item.url.c_str(), 
	url_item.referer.c_str());

以上省略掉网络发送的过程。从log上看，两种用法的本地log均符合预期，但是第一种用法server端却没有收到url_item对应的referer数据，第二种用法却正常。UrlItem实际是一个成员均为public的类，直接用“.”对public成员赋值，与调用thrift生成的__set_XX方法看来是有差异的，具体差在哪呢？让我们从源码来看：

// Presence flags for UrlItem; only the optional field `referer` has one.
typedef struct _UrlItem__isset {
  // The flag starts out false.
  _UrlItem__isset() : referer(false) {}
  bool referer;
} _UrlItem__isset;

// Thrift-generated C++ class for the UrlItem struct (excerpt).
class UrlItem {
 public:

  static const char* ascii_fingerprint; // = "5B708A954C550ECA9C1A49D3C5CAFAB9";
  static const uint8_t binary_fingerprint[16]; // = {0x5B,0x70,0x8A,0x95,0x4C,0x55,0x0E,0xCA,0x9C,0x1A,0x49,0xD3,0xC5,0xCA,0xFA,0xB9};

  // Both strings start out empty.
  UrlItem() : url(""), referer("") {
  }

  virtual ~UrlItem() throw() {}

  // The fields are public, so callers can assign them without updating __isset.
  std::string url;
  std::string referer;

  // Presence flag(s) for the optional field(s).
  _UrlItem__isset __isset;

  // Stores the required field; no presence flag is involved.
  void __set_url(const std::string& val) {
    url = val;
  }

  // Stores the optional field and marks it as set.
  void __set_referer(const std::string& val) {
    referer = val;
    __isset.referer = true;
  }

从上面thrift对IDL生成的代码可以看到，它用了一个额外的结构体_UrlItem__isset来标记optional成员的写（赋值）状态：使用__set_XX方法会把__isset中对应的字段置为true，而直接用“.”给成员赋值却没有这个作用。那这对网络发送有神马影响呢？让我们继续看序列化读写源码：

// Deserializes a UrlItem from `iprot`.
// Returns the sum of the values returned by the protocol read/skip calls.
// Throws TProtocolException(INVALID_DATA) if field 1 (url) was not read as a string.
uint32_t UrlItem::read(::apache::thrift::protocol::TProtocol* iprot) {

  uint32_t xfer = 0;
  std::string fname;
  ::apache::thrift::protocol::TType ftype;
  int16_t fid;

  xfer += iprot->readStructBegin(fname);

  using ::apache::thrift::protocol::TProtocolException;

  // Tracks whether the required field was seen; checked after the loop.
  bool isset_url = false;

  // Read fields until the T_STOP marker.
  while (true)
  {
    xfer += iprot->readFieldBegin(fname, ftype, fid);
    if (ftype == ::apache::thrift::protocol::T_STOP) {
      break;
    }
    // Dispatch on the field id; a field with an unexpected type is skipped.
    switch (fid)
    {
      case 1:
        if (ftype == ::apache::thrift::protocol::T_STRING) {
          xfer += iprot->readString(this->url);
          isset_url = true;
        } else {
          xfer += iprot->skip(ftype);
        }
        break;
      case 2:
        if (ftype == ::apache::thrift::protocol::T_STRING) {
          xfer += iprot->readString(this->referer);
          // Reading the optional field marks it as set.
          this->__isset.referer = true;
        } else {
          xfer += iprot->skip(ftype);
        }
        break;
      default:
        // Unknown field ids are skipped.
        xfer += iprot->skip(ftype);
        break;
    }
    xfer += iprot->readFieldEnd();
  }

  xfer += iprot->readStructEnd();

  // The required field must have been read.
  if (!isset_url)
    throw TProtocolException(TProtocolException::INVALID_DATA);
  return xfer;
}

// Serializes this UrlItem to `oprot`.
// Returns the sum of the values returned by the protocol write calls.
uint32_t UrlItem::write(::apache::thrift::protocol::TProtocol* oprot) const {
  uint32_t xfer = 0;
  xfer += oprot->writeStructBegin("UrlItem");
  // The required field is always written.
  xfer += oprot->writeFieldBegin("url", ::apache::thrift::protocol::T_STRING, 1);
  xfer += oprot->writeString(this->url);
  xfer += oprot->writeFieldEnd();
  // The optional field is written only when its __isset flag is true;
  // the content of `referer` itself is not consulted.
  if (this->__isset.referer) {
    xfer += oprot->writeFieldBegin("referer", ::apache::thrift::protocol::T_STRING, 2);
    xfer += oprot->writeString(this->referer);
    xfer += oprot->writeFieldEnd();
  }
  xfer += oprot->writeFieldStop();
  xfer += oprot->writeStructEnd();
  return xfer;
}

显然，序列化写操作会对optional成员的写标记进行判断：当通过网络对外发送时会调用write方法，如果__isset中对应成员的写标记没有被置为true，该字段也就不会对外发送了；而required必选字段总是会被写出，不会有此问题。嗯，问题就在这里。这样做有神马好处呢？这样能减少网络发送的开销，对于未使用的optional可选字段就没有必要发送了。thrift这里的实现是在结构体中为每个可选成员都用一个bool变量来标记，相对比较浪费，一个字段一个字节呢；而反序列化读操作是通过字段类型和字段ID去判断的，读到optional字段时会自动置上标记，无需担心。因此，当写optional字段时一定要采用__set_XX成员方法的方式（或者在直接赋值后手动把__isset中对应的标记置为true）！
google的protobuf又是怎么处理的呢？笔者把相关代码片断放一块了：

// Protobuf counterpart of the thrift UrlItem struct.
message UrlItem {
	// Required field, number 1.
	required string url = 1;
	// Optional field, number 2.
	optional string referer = 2;
}

// protoc-generated C++ class for the UrlItem message (excerpt).
// Fields are private and reachable only through the accessors below.
class UrlItem : public ::google::protobuf::Message {
 public:
  UrlItem();
  virtual ~UrlItem();
  
  UrlItem(const UrlItem& from);

  // required string url = 1;
  inline bool has_url() const;
  inline void clear_url();
  static const int kUrlFieldNumber = 1;
  inline const ::std::string& url() const;
  inline void set_url(const ::std::string& value);
  inline void set_url(const char* value);
  inline void set_url(const char* value, size_t size);
  inline ::std::string* mutable_url();
  inline ::std::string* release_url();
  
  // optional string referer = 2;
  inline bool has_referer() const;
  inline void clear_referer();
  static const int kRefererFieldNumber = 2;
  inline const ::std::string& referer() const;
  inline void set_referer(const ::std::string& value);
  inline void set_referer(const char* value);
  inline void set_referer(const char* value, size_t size);
  inline ::std::string* mutable_referer();
  inline ::std::string* release_referer();

 private:
  // Presence-bit helpers; private, so only the class itself can flip the bits.
  inline void set_has_url();
  inline void clear_has_url();
  inline void set_has_referer();
  inline void clear_has_referer();
  
  ::google::protobuf::UnknownFieldSet _unknown_fields_;
  
  ::std::string* url_;
  ::std::string* referer_;
  
  mutable int _cached_size_;
  // Presence bits packed into 32-bit words: (2 + 31) / 32 == 1 word for the two fields.
  ::google::protobuf::uint32 _has_bits_[(2 + 31) / 32];
};

// required string url = 1;
// Presence of url is bit 0x1 of _has_bits_[0].
inline bool UrlItem::has_url() const {
  return (_has_bits_[0] & 0x00000001u) != 0;
}
// Sets the url presence bit.
inline void UrlItem::set_has_url() {
  _has_bits_[0] |= 0x00000001u;
}
// Clears the url presence bit.
inline void UrlItem::clear_has_url() {
  _has_bits_[0] &= ~0x00000001u;
}
// Empties the string (unless url_ still points at kEmptyString) and clears the presence bit.
inline void UrlItem::clear_url() {
  if (url_ != &::google::protobuf::internal::kEmptyString) {
    url_->clear();
  }
  clear_has_url();
}
// Returns the current value by const reference.
inline const ::std::string& UrlItem::url() const {
  return *url_;
}
// Marks url as present, then copies `value` into it.
inline void UrlItem::set_url(const ::std::string& value) {
  set_has_url();
  // Allocate a separate string the first time, instead of writing through kEmptyString.
  if (url_ == &::google::protobuf::internal::kEmptyString) {
    url_ = new ::std::string;
  }
  url_->assign(value);
}
// Same as above for a C string.
inline void UrlItem::set_url(const char* value) {
  set_has_url();
  if (url_ == &::google::protobuf::internal::kEmptyString) {
    url_ = new ::std::string;
  }
  url_->assign(value);
}
// Marks url as present, then copies the first `size` bytes of `value` into it.
// The cast's target type <const char*> is restored here: without it the
// reinterpret_cast expression is ill-formed (the angle-bracketed type was most
// likely stripped as an HTML tag when the snippet was published).
inline void UrlItem::set_url(const char* value, size_t size) {
  set_has_url();
  // Allocate a separate string the first time, instead of writing through kEmptyString.
  if (url_ == &::google::protobuf::internal::kEmptyString) {
    url_ = new ::std::string;
  }
  url_->assign(reinterpret_cast<const char*>(value), size);
}
// Marks url as present and returns a writable pointer to it,
// allocating a separate string first if url_ still points at kEmptyString.
inline ::std::string* UrlItem::mutable_url() {
  set_has_url();
  if (url_ == &::google::protobuf::internal::kEmptyString) {
    url_ = new ::std::string;
  }
  return url_;
}
// Clears the presence bit and hands the stored string pointer to the caller,
// resetting url_ to kEmptyString. Returns NULL if no separate string was allocated.
// NOTE(review): presumably the caller becomes responsible for deleting the result — confirm.
inline ::std::string* UrlItem::release_url() {
  clear_has_url();
  if (url_ == &::google::protobuf::internal::kEmptyString) {
    return NULL;
  } else {
    ::std::string* temp = url_;
    url_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
    return temp;
  }
}

// optional string referer = 2;
// Presence of referer is bit 0x2 of _has_bits_[0].
inline bool UrlItem::has_referer() const {
  return (_has_bits_[0] & 0x00000002u) != 0;
}
// Sets the referer presence bit.
inline void UrlItem::set_has_referer() {
  _has_bits_[0] |= 0x00000002u;
}
// Clears the referer presence bit.
inline void UrlItem::clear_has_referer() {
  _has_bits_[0] &= ~0x00000002u;
}
// Empties the string (unless referer_ still points at kEmptyString) and clears the presence bit.
inline void UrlItem::clear_referer() {
  if (referer_ != &::google::protobuf::internal::kEmptyString) {
    referer_->clear();
  }
  clear_has_referer();
}
// Returns the current value by const reference.
inline const ::std::string& UrlItem::referer() const {
  return *referer_;
}
// Marks referer as present, then copies `value` into it.
inline void UrlItem::set_referer(const ::std::string& value) {
  set_has_referer();
  // Allocate a separate string the first time, instead of writing through kEmptyString.
  if (referer_ == &::google::protobuf::internal::kEmptyString) {
    referer_ = new ::std::string;
  }
  referer_->assign(value);
}
// Same as above for a C string.
inline void UrlItem::set_referer(const char* value) {
  set_has_referer();
  if (referer_ == &::google::protobuf::internal::kEmptyString) {
    referer_ = new ::std::string;
  }
  referer_->assign(value);
}
// Marks referer as present, then copies the first `size` bytes of `value` into it.
// The cast's target type <const char*> is restored here: without it the
// reinterpret_cast expression is ill-formed (the angle-bracketed type was most
// likely stripped as an HTML tag when the snippet was published).
inline void UrlItem::set_referer(const char* value, size_t size) {
  set_has_referer();
  // Allocate a separate string the first time, instead of writing through kEmptyString.
  if (referer_ == &::google::protobuf::internal::kEmptyString) {
    referer_ = new ::std::string;
  }
  referer_->assign(reinterpret_cast<const char*>(value), size);
}
// Marks referer as present and returns a writable pointer to it,
// allocating a separate string first if referer_ still points at kEmptyString.
inline ::std::string* UrlItem::mutable_referer() {
  set_has_referer();
  if (referer_ == &::google::protobuf::internal::kEmptyString) {
    referer_ = new ::std::string;
  }
  return referer_;
}
// Clears the presence bit and hands the stored string pointer to the caller,
// resetting referer_ to kEmptyString. Returns NULL if no separate string was allocated.
// NOTE(review): presumably the caller becomes responsible for deleting the result — confirm.
inline ::std::string* UrlItem::release_referer() {
  clear_has_referer();
  if (referer_ == &::google::protobuf::internal::kEmptyString) {
    return NULL;
  } else {
    ::std::string* temp = referer_;
    referer_ = const_cast< ::std::string*>(&::google::protobuf::internal::kEmptyString);
    return temp;
  }
}

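google protobuf的实现这里用到标志位比特_has_bits_，每个字段只占一个bit，相对thrift每个可选字段一个bool省却不少空间哦；而且使用了varint可变字节压缩编码，当数据值相对较小时是很节省的（thrift的TCompactProtocol同样使用了varint加zigzag编码，zigzag对负数值更省空间，protobuf中对应的是sint32/sint64类型；而thrift的TBinaryProtocol则是定长编码），再者protobuf序列化反序列化等实现上也更高效(比较下与thrift的代码就可看出)，如果我有选型的决定权，肯定推崇google protobuf啦~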

PandaDemo

The New Beginning

thrift 序列化字段读写的一个小坑

《thrift 序列化字段读写的一个小坑》有1个想法

发表回复取消回复

《thrift 序列化字段读写的一个小坑》有1个想法

发表回复 取消回复

发表回复取消回复