Connecting the Turing Robot API to the NAO Robot

Create a new project with NAO's bundled Choregraphe; the overall structure is shown below:
(figure: overall project structure in Choregraphe)
In short, the process breaks down into four steps (a minimal end-to-end sketch follows the list):

  1. Start NAO's built-in recording program: tap NAO's head to begin recording, and output the path of the recorded audio file.
  2. Because the Turing Robot API currently accepts plain text only, convert the recording to text.
  3. Send the converted text to the Turing Robot API and get its answer.
  4. Pair different answers with different body movements.
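
Before going box by box, here is an illustrative sketch of the whole pipeline. The four helper names are placeholders for the Choregraphe boxes built in the sections below, not real functions:

```python
# -*- coding: utf-8 -*-
# Illustrative only: each placeholder corresponds to one of the four steps
# and is realized by a Choregraphe box described later in this post.
def handle_head_tap():
    wav_path = record_sound()        # step 1: record after a head tap
    text = baidu_asr(wav_path)       # step 2: speech -> text via Baidu ASR
    answer = request_tuling(text)    # step 3: text -> reply via the Turing API
    perform_action(answer)           # step 4: speak it with matching gestures
```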

Setting up the NAO recording box

Launch Choregraphe and search for Record Sound in the box library at the lower left, as shown below:
(figure: the Record Sound box in the box library)
Box inputs and outputs each come in four basic types: dynamic, bang, number, and string.
See the official documentation for the exact differences between them.
Click into Record Sound and modify the Rec. Sound File box's script as follows:

```python
import time

class MyClass(GeneratedClass):
    def __init__(self):
        GeneratedClass.__init__(self, False)
        try:
            self.ad = ALProxy("ALAudioDevice")
        except Exception as e:
            self.ad = None
            self.logger.error(e)
        self.leds = ALProxy("ALLeds")
        self.player = ALProxy('ALAudioPlayer')
        self.filepath = ""

    def onLoad(self):
        self.bIsRecording = False
        self.bIsRunning = False

    def onUnload(self):
        self.bIsRunning = False
        # Upload the cue sound to '/home/nao/recordings/ringstones/' via FTP beforehand
        if self.bIsRecording:
            # Play a short "drip" sound to signal that recording has ended
            self.player.post.playFileFromPosition('/home/nao/recordings/ringstones/drip.wav', 0.00, 1.00, 0.00)
            self.ad.stopMicrophonesRecording()
            self.bIsRecording = False

    def onInput_onStart(self, p):
        if self.bIsRunning:
            return
        self.bIsRunning = True
        sGroup = "FaceLeds"
        RGB = [0, 255, 51]
        sExtension = self.toExtension(self.getParameter("Microphones used"))
        self.filepath = p + sExtension
        if self.ad:
            # Turn the eye LEDs green as a cue that recording has started
            self.leds.fadeRGB(sGroup, 256 * 256 * RGB[0] + 256 * RGB[1] + RGB[2], 0.5)
            time.sleep(0.2)
            self.ad.startMicrophonesRecording(self.filepath)
            self.bIsRecording = True
        else:
            self.logger.warning("No sound recorded")

    def onInput_onStop(self):
        if self.bIsRunning:
            self.onUnload()
            self.onStopped(self.filepath)

    def toExtension(self, sMicrophones):
        if sMicrophones == "Front head microphone only (.ogg)":
            return ".ogg"
        return ".wav"
```

The main additions are the cue when recording starts and the eye-LED indicator on the NAO; the box finally outputs the path of the recorded file.
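
To verify the eye-LED cue on its own, here is a quick sketch runnable from a desktop Python 2 shell with the naoqi SDK (NAO_IP is a placeholder for your robot's address):

```python
from naoqi import ALProxy

NAO_IP = "192.168.1.100"  # placeholder: your robot's address

leds = ALProxy("ALLeds", NAO_IP, 9559)
RGB = [0, 255, 51]
# fadeRGB takes the color packed as a single 0x00RRGGBB integer
leds.fadeRGB("FaceLeds", 256 * 256 * RGB[0] + 256 * RGB[1] + RGB[2], 0.5)
```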

Processing the recorded file

Let's go straight to the code:

```python
import urllib2
import json
import base64
import httplib

class MyClass(GeneratedClass):
    def __init__(self):
        GeneratedClass.__init__(self)

    def onLoad(self):
        # put initialization code here
        pass

    def onUnload(self):
        # put clean-up code here
        pass

    # p is the path of the recorded audio file
    def onInput_onStart(self, p):
        accessToken = self.getAccessToken()
        speech_file = str(p)
        self.logger.info(p)
        try:
            cmd = self.baidu_asr(speech_file, accessToken)
            self.logger.info(cmd)
        except Exception as e:
            cmd = ''
            self.logger.info(cmd)
        res = self.request_tuling(cmd)
        self.logger.info(res)
        # Parse the reply once, then pull out each field with a fallback
        try:
            response_dic = json.loads(res, encoding='utf-8')
        except Exception as e:
            self.logger.error(e)
            response_dic = {}
        try:
            answer = response_dic['text'].encode('utf-8', 'ignore')
        except Exception as e:
            self.logger.error(e)
            answer = '对不起,没能听清你的话呢'  # "Sorry, I couldn't catch what you said"
        try:
            special_value1 = response_dic['special_value1']
        except Exception as e:
            self.logger.error(e)
            special_value1 = '0'
        try:
            special_value2 = response_dic['special_value2'].encode('utf-8', 'ignore')
        except Exception as e:
            self.logger.error(e)
            special_value2 = ''
        self.output([answer, special_value1, special_value2])

    def onInput_onStop(self):
        self.onUnload()   # it is recommended to reuse the clean-up as the box is stopped
        self.onStopped()  # activate the output of the box

    # Fetch a Baidu OAuth access token for the speech API
    def getAccessToken(self):
        ApiKey = 'your_baidu_speech_apikey'
        SecretKey = 'your_secret_key'
        auth_url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=" + ApiKey + "&client_secret=" + SecretKey
        res = urllib2.urlopen(auth_url)
        json_data = res.read()
        return json.loads(json_data)['access_token']

    # Call the Baidu ASR endpoint: takes an audio file path, returns the recognized text
    def baidu_asr(self, speech_file, access_token):
        asr_server = 'http://vop.baidu.com/server_api'
        with open(speech_file, 'rb') as f:
            speech_data = f.read()
        speech_base64 = base64.b64encode(speech_data).decode('utf-8')
        speech_length = len(speech_data)
        data_dict = {'format': 'wav', 'rate': 16000, 'channel': 1, 'cuid': '005056c00008',
                     'token': access_token, 'lan': 'zh', 'speech': speech_base64, 'len': speech_length}
        json_data = json.dumps(data_dict)
        json_length = len(json_data)
        request = urllib2.Request(url=asr_server)
        request.add_header("Content-Type", "application/json")
        request.add_header("Content-Length", json_length)
        fs = urllib2.urlopen(url=request, data=json_data)
        result_str = fs.read()
        json_resp = json.loads(result_str, encoding='utf-8')
        self.logger.info('result_str ' + result_str)
        return json_resp['result'][0].encode('utf-8', 'ignore')

    # Call the Turing Robot chat API with the recognized text
    def request_tuling(self, cmd):
        if not cmd:
            return ''
        requrl = '/openapi/api?key=your_tuling_apikey&info=%s' % cmd
        try:
            conn = httplib.HTTPConnection("www.tuling123.com", timeout=5)
            conn.request(method="GET", url=requrl)
            response = conn.getresponse()
            res = response.read()
            conn.close()
        except Exception as e:
            self.logger.error(e)
            res = ''
        return res
```

This stage breaks down into two steps:

  1. Call the Baidu speech recognition API to get the recognized text.
  2. Send the recognized text to the Turing API to get the bot's answer.

    The original code also handled motion-control logic; that part involves private material and is omitted here.
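
For reference, a standard reply from the Turing V1 API carries a code and a text field; the special_value1 and special_value2 fields read above are custom fields of this project's bot, not part of the stock reply. A minimal parse check with a made-up payload (Python 2, like the box code):

```python
# -*- coding: utf-8 -*-
import json

# Made-up sample: "code" and "text" follow the Turing V1 reply format;
# special_value1/special_value2 are this project's own additions.
sample = '{"code": 100000, "text": "\u4f60\u597d", "special_value1": "1", "special_value2": "dance"}'
d = json.loads(sample, encoding='utf-8')
print d['text'].encode('utf-8', 'ignore')  # the answer the robot will speak
print d.get('special_value1', '0')         # default mirrors the box's fallback
```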

Handling different answers differently

This stage handles motion commands (forward, backward, left, right, dance) as well as ordinary conversation.
In short, answers containing motion keywords go to the motion-control box (the controlBox mentioned above), plain-text replies go to the SayWithBehavior box, and the routing is driven by the decision box; a hypothetical sketch of that routing follows.
The motion-control part involves private material and is omitted, so the rest of this section covers the plain-text output, i.e. the SayWithBehavior box.
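
The decision box script itself was not published, so this is only a guess at its shape; the keyword list and the onMotion/onSay output names are assumptions:

```python
# -*- coding: utf-8 -*-
# Hypothetical decision-box script. The keyword list and the
# onMotion/onSay outputs are assumptions, not the author's code.
class MyClass(GeneratedClass):
    def __init__(self):
        GeneratedClass.__init__(self)

    def onInput_onStart(self, p):
        # p is the [answer, special_value1, special_value2] list from the previous box
        answer, special_value1, special_value2 = p
        motion_keywords = ['前', '后', '左', '右', '跳舞']  # forward/back/left/right/dance
        if any(k in answer for k in motion_keywords):
            self.onMotion(p)    # route to controlBox
        else:
            self.onSay(answer)  # route to SayWithBehavior
```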
Its structure is shown below:
(figure: structure of the SayWithBehavior box)
The random-motion boxes contain movements of different durations; see the Timeline section of NAO's official documentation for details.
At the end, the robot returns to its initial posture (standing).
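
Outside Choregraphe, the same say-with-gestures-then-stand idea can be sketched with the standard ALAnimatedSpeech and ALRobotPosture modules (NAO_IP is a placeholder):

```python
# -*- coding: utf-8 -*-
from naoqi import ALProxy

NAO_IP = "192.168.1.100"  # placeholder: your robot's address

speech = ALProxy("ALAnimatedSpeech", NAO_IP, 9559)
posture = ALProxy("ALRobotPosture", NAO_IP, 9559)

speech.say("你好")                 # speaks while playing matching gestures
posture.goToPosture("Stand", 0.8)  # restore the initial standing posture
```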

Open issues

  1. Recording must still be triggered by tapping the head each time; I haven't figured out how to detect speech by sound level and start recording automatically.
  2. The recording end time is currently hard-coded.
  3. The whole loop is slow (NAO recording plus speech recognition), and failures along the way give no immediate feedback, so in practice users end up tapping the head repeatedly to retry recognition.