记一次Android项目网络优化

1,背景

去年年末换了新公司,发现公司App的网络响应速度十分之慢,一个页面十秒的等待时间甚至超时都是常态,一方面的原因是服务器部署在国外,另一方面App内部可能存在问题。

2,诊断

利用okhttp中的EventListener接口,可以方便地监听每次网络请求的生命周期。

代码如下:

/**
 * OkHttp [EventListener] that records a timestamped trace of one call's lifecycle.
 *
 * Every callback appends a line of the form `<seconds since callStartNanos>-<event>` to an
 * in-memory log. When the call ends (`callEnd`) or fails (`callFailed`) the whole trace is written
 * to Logcat and the call's elapsed time is added to a running total shared by all listeners.
 *
 * [FACTORY] creates a fresh instance for every call it is asked about.
 *
 * @param callId sequential id used to tell calls apart in the log
 * @param url URL of the request being traced
 * @param callStartNanos [System.nanoTime] reading that all elapsed times are measured from
 */
open class HttpEventListener(
  callId: Long,
  url: HttpUrl,
  private val callStartNanos: Long
) : EventListener() {

  // Label embedded in the callStart log line; it is never assigned anything but "" in this class.
  private val client = ""

  private val df = SimpleDateFormat("yyyy-MM-dd HH:mm:ss::SSS", Locale.US)

  // Accumulated trace for this call, starting with "<url> callId is:<id>:".
  private val sbLog: StringBuilder = StringBuilder(url.toString())
      .append(" callId is:$callId").append(":\n")

  // Structured record of the call. It is only filled in here; this class does not persist it.
  private val netLog: NetLogRealmObj = NetLogRealmObj().apply {
    startTime = df.format(Date())
    api = url.toString()
    this.callId = callId.toString()
  }

  /**
   * Appends [httpEvent] to the trace, prefixed with the seconds elapsed since the call started.
   * On a terminal event the trace is logged and the elapsed time is added to the shared total.
   */
  private fun recordEventLog(httpEvent: String) {
    val elapsedSeconds = (System.nanoTime() - callStartNanos) / NANOS_PER_SECOND
    sbLog
        .append(String.format(Locale.US, "%.3f", elapsedSeconds))
        .append('-')
        .append(httpEvent)
        .append(";\n")
    if (httpCallIsEndOrFailed(httpEvent)) {
      Log.i(TAG, sbLog.toString())
      // The total is shared by the listeners of all calls, and calls can finish on different
      // threads, so the read-modify-write must not interleave.
      val total = synchronized(totalTimeLock) {
        totalTimeUsed += elapsedSeconds
        totalTimeUsed
      }
      Log.i(TAG, "TotalTimeUsed : $total")
    }
  }

  /**
   * True for the two terminal events. Matching on the event prefix (rather than "contains") keeps
   * a URL or exception message that happens to include "callFailed" from ending the trace early.
   */
  private fun httpCallIsEndOrFailed(httpEvent: String) =
      httpEvent == "callEnd" || httpEvent.startsWith("callFailed")

  override fun callStart(call: Call) {
    super.callStart(call)
    val request = call.request()
    recordEventLog("callStart:$client:::$request")
    netLog.method = request.method()
  }

  override fun dnsStart(call: Call, domainName: String) {
    super.dnsStart(call, domainName)
    recordEventLog("dnsStart:$domainName")
  }

  override fun dnsEnd(call: Call, domainName: String, inetAddressList: List<InetAddress>) {
    super.dnsEnd(call, domainName, inetAddressList)
    if (inetAddressList.isEmpty()) {
      recordEventLog("dnsEnd:No IP get!!!")
      return
    }
    recordEventLog("dnsEnd:$inetAddressList")
    // Separate the addresses so several resolved IPs do not run together into one token.
    netLog.ip += inetAddressList.joinToString(separator = ",")
  }

  override fun connectStart(call: Call, inetSocketAddress: InetSocketAddress, proxy: Proxy) {
    super.connectStart(call, inetSocketAddress, proxy)
    recordEventLog(
        "connectStart: InetSocketAddress is: $inetSocketAddress Proxy is $proxy"
    )
  }

  override fun secureConnectStart(call: Call) {
    super.secureConnectStart(call)
    recordEventLog("secureConnectStart: ${call.request()}")
  }

  override fun secureConnectEnd(call: Call, handshake: Handshake?) {
    super.secureConnectEnd(call, handshake)
    val tlsVersion = handshake?.tlsVersion() ?: "unknown"
    recordEventLog("secureConnectEnd: tlsVersion is :$tlsVersion")
  }

  override fun connectEnd(
    call: Call,
    inetSocketAddress: InetSocketAddress,
    proxy: Proxy,
    protocol: Protocol?
  ) {
    super.connectEnd(call, inetSocketAddress, proxy, protocol)
    recordEventLog("connectEnd: Ip is:$inetSocketAddress")
  }

  override fun connectFailed(
    call: Call,
    inetSocketAddress: InetSocketAddress,
    proxy: Proxy,
    protocol: Protocol?,
    ioe: IOException
  ) {
    super.connectFailed(call, inetSocketAddress, proxy, protocol, ioe)
    // Include the cause; without it the trace shows that a connect failed but not why.
    recordEventLog("connectFailed: $inetSocketAddress $ioe")
  }

  override fun connectionAcquired(call: Call, connection: Connection) {
    super.connectionAcquired(call, connection)
    recordEventLog("connectionAcquired: $connection")
  }

  override fun connectionReleased(call: Call, connection: Connection) {
    super.connectionReleased(call, connection)
    recordEventLog("connectionReleased:$connection")
  }

  override fun requestHeadersStart(call: Call) {
    super.requestHeadersStart(call)
    recordEventLog("requestHeadersStart")
  }

  override fun requestHeadersEnd(call: Call, request: Request) {
    super.requestHeadersEnd(call, request)
    recordEventLog("requestHeadersEnd")
  }

  override fun requestBodyStart(call: Call) {
    super.requestBodyStart(call)
    recordEventLog("requestBodyStart")
  }

  override fun requestBodyEnd(call: Call, byteCount: Long) {
    super.requestBodyEnd(call, byteCount)
    recordEventLog("requestBodyEnd: and the byteCount is $byteCount")
  }

  override fun responseHeadersStart(call: Call) {
    super.responseHeadersStart(call)
    recordEventLog("responseHeadersStart:${call.request()}")
  }

  override fun responseHeadersEnd(call: Call, response: Response) {
    super.responseHeadersEnd(call, response)
    recordEventLog("responseHeadersEnd:${call.request()}")
  }

  override fun requestFailed(call: Call, ioe: IOException) {
    super.requestFailed(call, ioe)
    recordEventLog("requestFailed:$ioe")
  }

  override fun responseBodyStart(call: Call) {
    super.responseBodyStart(call)
    recordEventLog("responseBodyStart")
  }

  override fun responseBodyEnd(call: Call, byteCount: Long) {
    super.responseBodyEnd(call, byteCount)
    recordEventLog(
        "responseBodyEnd: and the responseBody byte count is ${byteCount / 1000.0} KB"
    )
  }

  override fun responseFailed(call: Call, ioe: IOException) {
    super.responseFailed(call, ioe)
    recordEventLog("responseFailed:$ioe")
  }

  override fun callEnd(call: Call) {
    super.callEnd(call)
    recordEventLog("callEnd")
  }

  override fun callFailed(call: Call, ioe: IOException) {
    super.callFailed(call, ioe)
    recordEventLog("callFailed: $ioe")
  }

  companion object {
    private const val TAG = "HttpEventListener"

    private const val NANOS_PER_SECOND = 1_000_000_000.0

    // Source of the sequential ids handed to each new listener.
    private val nextCallId = AtomicLong(1L)

    // Guards totalTimeUsed.
    private val totalTimeLock = Any()

    // Sum, in seconds, of the elapsed time of every call that has ended or failed so far.
    private var totalTimeUsed = 0.0

    /** Factory that attaches a new [HttpEventListener] with a fresh call id to each call. */
    @JvmField
    val FACTORY: Factory = Factory { call ->
        val callId = nextCallId.getAndIncrement()
        HttpEventListener(callId, call.request().url(), System.nanoTime())
    }

    /** Factory that always returns the `NONE` listener, i.e. no tracing. */
    @JvmField
    val NoneFactory: Factory = Factory { NONE }
  }
}

得到每个网络请求的生命周期:

发现红框中的流程,DNS解析—TCP握手—SSL握手,每个网络请求都在不断的重复。

理论上同一个域名,第一次请求完毕之后,之后所有的请求都应该直接复用Socket连接。如下图所示:

这也正是HTTP1.1协议相对于HTTP1.0协议的关键改进所在,而不负责任的垃圾代码,直接将项目中的网络请求速度拉回到1995年。

可以参考《Http权威指南》中的对比图,有没有复用连接,时间差异巨大:

因此, 问题1:没有复用Socket连接。

问题2:网络流量没有压缩处理,在没有本地加密机制的情况下,GZIP对json的压缩效果在50%以上,这样8年的老项目一直没有人发现这一点也让我倍感意外。

问题2相对容易解决,只需要在Nginx端开启对json的压缩配置就可以修复。

3,找到Root Cause

3.1: 第一个问题:多个okhttpInstance,

首先是代码中很傻的声明了3个okhttpInstance,完全独立的对象之间自然不能共享线程池、连接池。

因此,

解决方法1,强制收敛为一个单例提供唯一instance,这样的话代码改动较大,会影响原来代码的test case;

解决方法2,声明一个baseOkhttpClient,然后让这三个okhttpInstance都通过baseOkhttpClient.newBuilder()构建出来,这样的话这3个okhttpInstance就能共享连接池、线程池等内部对象。

方法签名如下:

  /**
   * Returns a new Builder seeded from this client: {@code this} is handed to the
   * Builder constructor, which copies the client's fields.
   */
  public Builder newBuilder() {
    return new Builder(this);
  }

可以看到共享的原理是把this传给了新的Builder,所以如下的对象都是可以在多个okhttpInstance之间复用的:

    /**
     * Initialises this builder from an existing client by copying each of its fields.
     *
     * Object-typed fields are assigned by reference, so a client built from this builder
     * points at the same instances as {@code okHttpClient} unless the field is
     * overridden on the builder afterwards.
     *
     * @param okHttpClient the client whose configuration is copied
     */
    Builder(OkHttpClient okHttpClient) {
      // Same dispatcher instance as the source client.
      this.dispatcher = okHttpClient.dispatcher;
      this.proxy = okHttpClient.proxy;
      this.protocols = okHttpClient.protocols;
      this.connectionSpecs = okHttpClient.connectionSpecs;
      // Interceptors are copied element by element into this builder's own lists.
      this.interceptors.addAll(okHttpClient.interceptors);
      this.networkInterceptors.addAll(okHttpClient.networkInterceptors);
      this.eventListenerFactory = okHttpClient.eventListenerFactory;
      this.proxySelector = okHttpClient.proxySelector;
      this.cookieJar = okHttpClient.cookieJar;
      this.internalCache = okHttpClient.internalCache;
      this.cache = okHttpClient.cache;
      this.socketFactory = okHttpClient.socketFactory;
      this.sslSocketFactory = okHttpClient.sslSocketFactory;
      this.certificateChainCleaner = okHttpClient.certificateChainCleaner;
      this.hostnameVerifier = okHttpClient.hostnameVerifier;
      this.certificatePinner = okHttpClient.certificatePinner;
      this.proxyAuthenticator = okHttpClient.proxyAuthenticator;
      this.authenticator = okHttpClient.authenticator;
      // Same connection pool instance as the source client.
      this.connectionPool = okHttpClient.connectionPool;
      this.dns = okHttpClient.dns;
      this.followSslRedirects = okHttpClient.followSslRedirects;
      this.followRedirects = okHttpClient.followRedirects;
      this.retryOnConnectionFailure = okHttpClient.retryOnConnectionFailure;
      this.callTimeout = okHttpClient.callTimeout;
      this.connectTimeout = okHttpClient.connectTimeout;
      this.readTimeout = okHttpClient.readTimeout;
      this.writeTimeout = okHttpClient.writeTimeout;
      this.pingInterval = okHttpClient.pingInterval;
    }

至此第一个问题得到解决。

3.2: 第二个问题:sslSocketFactory对象每次都不同,

第一个问题解决后,发现Socket连接不能复用问题仍然存在。

为了找到根本原因,直接找到okhttp中负责判断Socket连接能否复用的源码,在RealConnection类中:

  /**
   * Returns true if this connection can carry a stream allocation to [address].
   *
   * @param address the address the caller wants to reach
   * @param routes the candidate routes for [address]; when null, a connection whose host differs
   *     from [address] is never eligible, because the coalescing checks below need routes
   */
  internal fun isEligible(address: Address, routes: List<Route>?): Boolean {
    assertThreadHoldsLock()

    // If this connection is not accepting new exchanges, we're done.
    if (calls.size >= allocationLimit || noNewExchanges) return false

    // If the non-host fields of the address don't overlap, we're done.
    if (!this.route.address.equalsNonHost(address)) return false

    // If the host exactly matches, we're done: this connection can carry the address.
    if (address.url.host == this.route().address.url.host) {
      return true // This connection is a perfect match.
    }

    // At this point we don't have a hostname match. But we may still be able to carry the request
    // if our connection coalescing requirements are met. See also:
    // https://hpbn.co/optimizing-application-delivery/#eliminate-domain-sharding
    // https://daniel.haxx.se/blog/2016/08/18/http2-connection-coalescing/

    // 1. This connection must be HTTP/2.
    if (http2Connection == null) return false

    // 2. The routes must share an IP address.
    if (routes == null || !routeMatchesAny(routes)) return false

    // 3. This connection's server certificates must cover the new host.
    if (address.hostnameVerifier !== OkHostnameVerifier) return false
    if (!supportsUrl(address.url)) return false

    // 4. Certificate pinning must match the host.
    try {
      address.certificatePinner!!.check(address.url.host, handshake()!!.peerCertificates)
    } catch (_: SSLPeerUnverifiedException) {
      return false
    }

    return true // The caller's address can be carried by this connection.
  }

直接打断点,然后开始网络请求,哪里return false,哪里就是Root Cause。

断点调试发现,是在 if (!this.route.address.equalsNonHost(address)) return false 这一行return了false,深入看看Address.equalsNonHost的实现细节:

  /**
   * Returns true when every non-host field of this address equals the corresponding field of
   * [that]: DNS, proxy authenticator, protocols, connection specs, proxy selector, proxy,
   * SSL socket factory, hostname verifier, certificate pinner and URL port. The URL host is not
   * compared.
   *
   * `==` here is `equals()`. For a field whose type does not override `equals()` that amounts to
   * reference identity, so two distinct instances compare as different.
   */
  internal fun equalsNonHost(that: Address): Boolean {
    return this.dns == that.dns &&
        this.proxyAuthenticator == that.proxyAuthenticator &&
        this.protocols == that.protocols &&
        this.connectionSpecs == that.connectionSpecs &&
        this.proxySelector == that.proxySelector &&
        this.proxy == that.proxy &&
        this.sslSocketFactory == that.sslSocketFactory &&
        this.hostnameVerifier == that.hostnameVerifier &&
        this.certificatePinner == that.certificatePinner &&
        this.url.port == that.url.port
  }

基本是比较了Address的所有属性,继续单步断点,发现是在this.sslSocketFactory == that.sslSocketFactory这个条件不满足,因此就诊断出了根本原因:sslSocketFactory的配置问题,每次网络请求的时候都是不同的sslSocketFactory对象。

原来的代码在实现证书锁定的时候,采用的是传统的sslSocketFactory, 这种方式比较繁琐,且项目中存在N多的域名需要pinning,因此果断采用okhttp的CertificatePinner, 虽然这个类的名字是证书绑定,实际上的实现却是公钥固定,okhttp的作者仍然这么命名,大概是因为至少在安全性的角度,这两者没有任何区别。如果公司有证书Rotate的机制,那么通常公钥固定更加有优势,因为一般而言,变化的是证书的有效期,而公钥是不会变的。然而,本公司的证书Rotate的机制是公钥跟着一起变,因此,这一点没区别。

简单声明:

CertificatePinner certPinner = new CertificatePinner.Builder()
        .add(xx1.domain1.com",
              "sha256/4hw5tz+scE+TW+dasda+nU7tq1V8=")
        .add(“xx2.domain1.com",
              "sha256/4hw5tz+scE+TW+3432423+nU7tq1V8=")
        .add(xx.domain2.com",
              "sha256/4hw5tz+scE+TW+iojjo+nU7tq1V8=")
        .build();
 
OkHttpClient okHttpClient = new OkHttpClient.Builder()
        .certificatePinner(certPinner)
        .build();

删除掉原来的sslSocketFactory代码,第二个问题得以解决。

4,优化效果:

同样的手机,网络,操作路径,把所有的网络耗时加起来,对比之前的耗时,可以得到如下数据,优化掉的效果大概是54.38%:

这是还没有开启Gzip的情况下,否则数据会更加夸张。

  Title Test1 Test2 Test3 Test4 Test5 Test6 avg net-time cost(seconds) total net-request nums avg cost time per request(seconds)
New Net time cost:(seconds) 519.7 423.6 505.7 416.7 419.1 399.6 447.4 110 4.07
Old Net time cost:(seconds) 807.4 813.3 1510.6 1316.2 698.1 738.4 980.7 110 8.92
Optimization Ratio: (Old-New)/Old:                 54.38%

5, 一个感叹

感觉项目已经搞不好了。