good morning!!!!

Skip to content
Snippets Groups Projects
Unverified Commit 9848028a authored by Igor Mandrigin's avatar Igor Mandrigin Committed by GitHub
Browse files

Implement a simple healthcheck (#2740)

parent d027d712
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,7 @@
- [Getting Started](#getting-started)
* [Running locally](#running-locally)
* [Running remotely](#running-remotely)
* [Healthcheck](#healthcheck)
* [Testing](#testing)
- [FAQ](#faq)
* [Relations between prune options and rpc methods](#relations-between-prune-options-and-rpc-method)
......@@ -63,6 +64,44 @@ The daemon should respond with something like:
INFO [date-time] HTTP endpoint opened url=localhost:8545...
```
### Healthcheck
Running the daemon also opens an endpoint `/health` that provides a basic
health check.
If the health check is successful it returns 200 OK.
If the health check fails it returns 500 Internal Server Error.
The health check is configured through a JSON object sent as the body of the POST request.
```
{
"min_peer_count": <minimal number of the node peers>,
"known_block": <number_of_block_that_node_should_know>
}
```
Omitting a field disables the corresponding check.
**`min_peer_count`** -- checks for a minimum number of healthy node peers. Requires
`net` namespace to be listed in `http.api`.
**`known_block`** -- sets up the block that node has to know about. Requires
`eth` namespace to be listed in `http.api`.
Example request
```http POST http://localhost:8545/health --raw '{"min_peer_count": 3, "known_block": "0x1F"}'```
Example response
```
{
"check_block": "HEALTHY",
"healthcheck_query": "HEALTHY",
"min_peer_count": "HEALTHY"
}
```
### Testing
By default, the `rpcdaemon` serves data from `localhost:8545`. You may send `curl` commands to see if things are
......
......@@ -15,6 +15,7 @@ import (
kv2 "github.com/ledgerwatch/erigon-lib/kv/mdbx"
"github.com/ledgerwatch/erigon-lib/kv/remotedb"
"github.com/ledgerwatch/erigon-lib/kv/remotedbserver"
"github.com/ledgerwatch/erigon/cmd/rpcdaemon/health"
"github.com/ledgerwatch/erigon/cmd/rpcdaemon/services"
"github.com/ledgerwatch/erigon/cmd/utils"
"github.com/ledgerwatch/erigon/common/paths"
......@@ -252,6 +253,10 @@ func StartRpcServer(ctx context.Context, cfg Flags, rpcAPI []rpc.API) error {
}
var handler http.Handler = http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// adding a healthcheck here
if health.ProcessHealthcheckIfNeeded(w, r, rpcAPI) {
return
}
if cfg.WebsocketEnabled && r.Method == "GET" {
wsHandler.ServeHTTP(w, r)
return
......
package health
import (
"context"
"fmt"
"github.com/ledgerwatch/erigon/rpc"
)
// checkBlockNumber verifies that the node can return the block identified by
// blockNumber.
//
// It returns an error when api is nil, when the lookup itself fails, or when
// the lookup succeeds but yields an empty result; otherwise it returns nil.
func checkBlockNumber(blockNumber rpc.BlockNumber, api EthAPI) error {
	if api == nil {
		return fmt.Errorf("no connection to the Erigon server or `eth` namespace isn't enabled")
	}

	// fullTx is false: only the presence of the block is inspected below.
	block, lookupErr := api.GetBlockByNumber(context.TODO(), blockNumber, false)
	switch {
	case lookupErr != nil:
		return lookupErr
	case len(block) == 0:
		// An empty result is treated as "block not found".
		return fmt.Errorf("no known block with number %v (%x hex)", blockNumber, blockNumber)
	default:
		return nil
	}
}
package health
import (
"context"
"fmt"
)
// checkMinPeers verifies that the node reports at least minPeerCount peers.
//
// It returns an error when api is nil, when querying the peer count fails, or
// when the reported count is below minPeerCount; otherwise it returns nil.
func checkMinPeers(minPeerCount uint, api NetAPI) error {
	if api == nil {
		return fmt.Errorf("no connection to the Erigon server or `net` namespace isn't enabled")
	}

	peerCount, err := api.PeerCount(context.TODO())
	if err != nil {
		return err
	}

	// Both sides are widened to uint64 so the comparison does not depend on
	// the platform width of uint.
	if uint64(peerCount) < uint64(minPeerCount) {
		// The message previously ended with an unbalanced ")".
		return fmt.Errorf("not enough peers: %d (minimum %d)", peerCount, minPeerCount)
	}
	return nil
}
package health
import (
"encoding/json"
"errors"
"fmt"
"io"
"io/ioutil"
"net/http"
"strings"
"github.com/ledgerwatch/erigon/rpc"
"github.com/ledgerwatch/log/v3"
)
// requestBody is the JSON payload of a health check request. Both fields are
// pointers so that a field missing from the payload (nil) can be told apart
// from an explicit value; ProcessHealthcheckIfNeeded skips a check whose
// field is nil.
type requestBody struct {
// MinPeerCount is the value passed to checkMinPeers as the minimum peer count.
MinPeerCount *uint `json:"min_peer_count"`
// BlockNumber is the block passed to checkBlockNumber.
BlockNumber *rpc.BlockNumber `json:"known_block"`
}
const (
// urlPath is the request path handled as a health check; it is compared
// case-insensitively in ProcessHealthcheckIfNeeded.
urlPath = "/health"
)
var (
// errCheckDisabled is a sentinel marking a check that was not requested.
// It is reported as "DISABLED" and does not change the response status code.
errCheckDisabled = errors.New("error check disabled")
)
// ProcessHealthcheckIfNeeded handles r as a health check when its URL path
// equals urlPath (compared case-insensitively).
//
// It parses the request body, runs the checks the body asks for against the
// services found in rpcAPI, and writes the outcome to w. Checks that are not
// requested stay marked as disabled. When the body cannot be parsed no check
// is run and the parse error is logged and reported.
//
// It returns true when the request was handled here, and false when the path
// does not match and the caller should keep processing the request.
func ProcessHealthcheckIfNeeded(
	w http.ResponseWriter,
	r *http.Request,
	rpcAPI []rpc.API,
) bool {
	if isHealthPath := strings.EqualFold(r.URL.Path, urlPath); !isHealthPath {
		return false
	}

	netAPI, ethAPI := parseAPI(rpcAPI)

	// Every check starts out disabled and is only switched on by the request.
	peersErr, blockErr := errCheckDisabled, errCheckDisabled

	request, parseErr := parseHealthCheckBody(r.Body)
	defer r.Body.Close()

	if parseErr == nil {
		// Peer count check, only when a minimum was supplied.
		if minPeers := request.MinPeerCount; minPeers != nil {
			peersErr = checkMinPeers(*minPeers, netAPI)
		}
		// Known block check, only when a block number was supplied.
		if knownBlock := request.BlockNumber; knownBlock != nil {
			blockErr = checkBlockNumber(*knownBlock, ethAPI)
		}
		// TODO add time from the last sync cycle
	} else {
		log.Root().Warn("unable to process healthcheck request", "error", parseErr)
	}

	if reportErr := reportHealth(parseErr, peersErr, blockErr, w); reportErr != nil {
		log.Root().Warn("unable to process healthcheck request", "error", reportErr)
	}
	return true
}
// parseHealthCheckBody reads reader to the end and decodes it as a JSON
// requestBody.
//
// The body comes straight from an HTTP request, so at most maxBodyBytes are
// accepted; a larger body is rejected with an error instead of being buffered
// in full. On any error the returned requestBody is the zero value.
func parseHealthCheckBody(reader io.Reader) (requestBody, error) {
	// Generous upper bound: a valid health check request is a few dozen bytes.
	const maxBodyBytes = 1 << 20

	// Read one byte past the limit so an oversized body can be told apart
	// from one that is exactly maxBodyBytes long.
	bodyBytes, err := ioutil.ReadAll(io.LimitReader(reader, maxBodyBytes+1))
	if err != nil {
		return requestBody{}, err
	}
	if len(bodyBytes) > maxBodyBytes {
		return requestBody{}, fmt.Errorf("healthcheck request body exceeds %d bytes", maxBodyBytes)
	}

	var body requestBody
	if err := json.Unmarshal(bodyBytes, &body); err != nil {
		// Do not hand back a partially decoded value together with an error.
		return requestBody{}, err
	}
	return body, nil
}
// reportHealth writes the outcome of the health check to w as a JSON object
// mapping each check name to its textual status (see errorStringOrOK).
//
// The response status is 200 OK unless at least one of the supplied errors
// counts as a failure (see shouldChangeStatusCode), in which case it is
// 500 Internal Server Error. The response is labelled as application/json.
//
// It returns an error when the report cannot be encoded or written.
func reportHealth(errParse, errMinPeerCount, errCheckBlock error, w http.ResponseWriter) error {
	checks := []struct {
		name string
		err  error
	}{
		{"healthcheck_query", errParse},
		{"min_peer_count", errMinPeerCount},
		{"check_block", errCheckBlock},
	}

	statusCode := http.StatusOK
	report := make(map[string]string, len(checks))
	for _, check := range checks {
		if shouldChangeStatusCode(check.err) {
			statusCode = http.StatusInternalServerError
		}
		report[check.name] = errorStringOrOK(check.err)
	}

	// Encode before committing the status line so an encoding failure cannot
	// go out under a success status.
	bodyJSON, err := json.Marshal(report)
	if err != nil {
		w.WriteHeader(http.StatusInternalServerError)
		return err
	}

	// Without an explicit Content-Type the server would sniff the body and
	// label the JSON as plain text.
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(statusCode)
	_, err = w.Write(bodyJSON)
	return err
}
// shouldChangeStatusCode reports whether err represents a failed check, i.e.
// it is non-nil and is not the errCheckDisabled sentinel.
func shouldChangeStatusCode(err error) bool {
	if err == nil {
		return false
	}
	isDisabled := errors.Is(err, errCheckDisabled)
	return !isDisabled
}
// errorStringOrOK converts the result of a check into its report text:
// "HEALTHY" for nil, "DISABLED" for the errCheckDisabled sentinel, and
// "ERROR: <message>" for any other error.
func errorStringOrOK(err error) string {
	switch {
	case err == nil:
		return "HEALTHY"
	case errors.Is(err, errCheckDisabled):
		return "DISABLED"
	default:
		return fmt.Sprintf("ERROR: %v", err)
	}
}
package health
import (
"context"
"github.com/ledgerwatch/erigon/common/hexutil"
"github.com/ledgerwatch/erigon/rpc"
)
// NetAPI is the capability the peer-count check needs: parseAPI selects a
// registered service that implements it, and checkMinPeers calls PeerCount
// on that service.
type NetAPI interface {
PeerCount(_ context.Context) (hexutil.Uint, error)
}
// EthAPI is the capability the known-block check needs: parseAPI selects a
// registered service that implements it, and checkBlockNumber calls
// GetBlockByNumber on that service, treating an empty result as "not found".
type EthAPI interface {
GetBlockByNumber(_ context.Context, number rpc.BlockNumber, fullTx bool) (map[string]interface{}, error)
}
package health
import (
"github.com/ledgerwatch/erigon/rpc"
)
// parseAPI scans the registered RPC services for ones that implement NetAPI
// and EthAPI and returns them.
//
// Entries without a service are skipped. When several services implement the
// same interface the last one in api wins. A result is nil when no service
// implements the corresponding interface.
func parseAPI(api []rpc.API) (netAPI NetAPI, ethAPI EthAPI) {
	for i := range api {
		service := api[i].Service
		if service == nil {
			continue
		}
		// The two assertions are independent: one service may satisfy both.
		if candidate, isNet := service.(NetAPI); isNet {
			netAPI = candidate
		}
		if candidate, isEth := service.(EthAPI); isEth {
			ethAPI = candidate
		}
	}
	return netAPI, ethAPI
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment