Connecting the Turing Robot API to the NAO Robot

Create a new project with NAO's bundled Choregraphe; the overall structure is shown below:
(figure: overall project structure in Choregraphe)
In short, the process breaks down into four steps (a minimal end-to-end sketch follows the list):

  1. Start NAO's built-in recording program: tap NAO's head to begin recording, and output the path of the recorded audio file.
  2. Because the Turing Robot API currently accepts plain text only, convert the recording to text.
  3. Send the converted text to the Turing Robot API and get its answer.
  4. Pair different answers with different body movements.
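
Before going box by box, here is an illustrative sketch of the whole pipeline. The four helper names are placeholders for the Choregraphe boxes built in the sections below, not real functions:

```python
# -*- coding: utf-8 -*-
# Illustrative only: each placeholder corresponds to one of the four steps
# and is realized by a Choregraphe box described later in this post.
def handle_head_tap():
    wav_path = record_sound()        # step 1: record after a head tap
    text = baidu_asr(wav_path)       # step 2: speech -> text via Baidu ASR
    answer = request_tuling(text)    # step 3: text -> reply via the Turing API
    perform_action(answer)           # step 4: speak it with matching gestures
```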

Setting up the NAO recording box

Launch Choregraphe and search for Record Sound in the box library at the lower left, as shown below:
(figure: the Record Sound box in the box library)
Box inputs and outputs each come in four basic types: dynamic, bang, number, and string.
See the official documentation for the exact differences between them.
Click into Record Sound and modify the Rec. Sound File box's script as follows:

```python
import time

class MyClass(GeneratedClass):
    def __init__(self):
        GeneratedClass.__init__(self, False)
        try:
            self.ad = ALProxy("ALAudioDevice")
        except Exception as e:
            self.ad = None
            self.logger.error(e)
        self.leds = ALProxy("ALLeds")
        self.player = ALProxy('ALAudioPlayer')
        self.filepath = ""

    def onLoad(self):
        self.bIsRecording = False
        self.bIsRunning = False

    def onUnload(self):
        self.bIsRunning = False
        # Upload the cue sound to '/home/nao/recordings/ringstones/' via FTP beforehand
        if self.bIsRecording:
            # Play a short "drip" sound to signal that recording has ended
            self.player.post.playFileFromPosition('/home/nao/recordings/ringstones/drip.wav', 0.00, 1.00, 0.00)
            self.ad.stopMicrophonesRecording()
            self.bIsRecording = False

    def onInput_onStart(self, p):
        if self.bIsRunning:
            return
        self.bIsRunning = True
        sGroup = "FaceLeds"
        RGB = [0, 255, 51]
        sExtension = self.toExtension(self.getParameter("Microphones used"))
        self.filepath = p + sExtension
        if self.ad:
            # Turn the eye LEDs green as a cue that recording has started
            self.leds.fadeRGB(sGroup, 256 * 256 * RGB[0] + 256 * RGB[1] + RGB[2], 0.5)
            time.sleep(0.2)
            self.ad.startMicrophonesRecording(self.filepath)
            self.bIsRecording = True
        else:
            self.logger.warning("No sound recorded")

    def onInput_onStop(self):
        if self.bIsRunning:
            self.onUnload()
            self.onStopped(self.filepath)

    def toExtension(self, sMicrophones):
        if sMicrophones == "Front head microphone only (.ogg)":
            return ".ogg"
        return ".wav"
```

The main additions are the cue when recording starts and the eye-LED indicator on the NAO; the box finally outputs the path of the recorded file.
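
To verify the eye-LED cue on its own, here is a quick sketch runnable from a desktop Python 2 shell with the naoqi SDK (NAO_IP is a placeholder for your robot's address):

```python
from naoqi import ALProxy

NAO_IP = "192.168.1.100"  # placeholder: your robot's address

leds = ALProxy("ALLeds", NAO_IP, 9559)
RGB = [0, 255, 51]
# fadeRGB takes the color packed as a single 0x00RRGGBB integer
leds.fadeRGB("FaceLeds", 256 * 256 * RGB[0] + 256 * RGB[1] + RGB[2], 0.5)
```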

Processing the recorded file

Let's go straight to the code:

```python
import urllib2
import json
import base64
import httplib

class MyClass(GeneratedClass):
    def __init__(self):
        GeneratedClass.__init__(self)

    def onLoad(self):
        # put initialization code here
        pass

    def onUnload(self):
        # put clean-up code here
        pass

    # p is the path of the recorded audio file
    def onInput_onStart(self, p):
        accessToken = self.getAccessToken()
        speech_file = str(p)
        self.logger.info(p)
        try:
            cmd = self.baidu_asr(speech_file, accessToken)
            self.logger.info(cmd)
        except Exception as e:
            cmd = ''
            self.logger.info(cmd)
        res = self.request_tuling(cmd)
        self.logger.info(res)
        # Parse the reply once, then pull out each field with a fallback
        try:
            response_dic = json.loads(res, encoding='utf-8')
        except Exception as e:
            self.logger.error(e)
            response_dic = {}
        try:
            answer = response_dic['text'].encode('utf-8', 'ignore')
        except Exception as e:
            self.logger.error(e)
            answer = '对不起,没能听清你的话呢'  # "Sorry, I couldn't catch what you said"
        try:
            special_value1 = response_dic['special_value1']
        except Exception as e:
            self.logger.error(e)
            special_value1 = '0'
        try:
            special_value2 = response_dic['special_value2'].encode('utf-8', 'ignore')
        except Exception as e:
            self.logger.error(e)
            special_value2 = ''
        self.output([answer, special_value1, special_value2])

    def onInput_onStop(self):
        self.onUnload()   # it is recommended to reuse the clean-up as the box is stopped
        self.onStopped()  # activate the output of the box

    # Fetch a Baidu OAuth access token for the speech API
    def getAccessToken(self):
        ApiKey = 'your_baidu_speech_apikey'
        SecretKey = 'your_secret_key'
        auth_url = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=" + ApiKey + "&client_secret=" + SecretKey
        res = urllib2.urlopen(auth_url)
        json_data = res.read()
        return json.loads(json_data)['access_token']

    # Call the Baidu ASR endpoint: takes an audio file path, returns the recognized text
    def baidu_asr(self, speech_file, access_token):
        asr_server = 'http://vop.baidu.com/server_api'
        with open(speech_file, 'rb') as f:
            speech_data = f.read()
        speech_base64 = base64.b64encode(speech_data).decode('utf-8')
        speech_length = len(speech_data)
        data_dict = {'format': 'wav', 'rate': 16000, 'channel': 1, 'cuid': '005056c00008',
                     'token': access_token, 'lan': 'zh', 'speech': speech_base64, 'len': speech_length}
        json_data = json.dumps(data_dict)
        json_length = len(json_data)
        request = urllib2.Request(url=asr_server)
        request.add_header("Content-Type", "application/json")
        request.add_header("Content-Length", json_length)
        fs = urllib2.urlopen(url=request, data=json_data)
        result_str = fs.read()
        json_resp = json.loads(result_str, encoding='utf-8')
        self.logger.info('result_str ' + result_str)
        return json_resp['result'][0].encode('utf-8', 'ignore')

    # Call the Turing Robot chat API with the recognized text
    def request_tuling(self, cmd):
        if not cmd:
            return ''
        requrl = '/openapi/api?key=your_tuling_apikey&info=%s' % cmd
        try:
            conn = httplib.HTTPConnection("www.tuling123.com", timeout=5)
            conn.request(method="GET", url=requrl)
            response = conn.getresponse()
            res = response.read()
            conn.close()
        except Exception as e:
            self.logger.error(e)
            res = ''
        return res
```

This stage breaks down into two steps:

  1. Call the Baidu speech recognition API to get the recognized text.
  2. Send the recognized text to the Turing API to get the bot's answer.

    The original code also handled motion-control logic; that part involves private material and is omitted here.
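
For reference, a standard reply from the Turing V1 API carries a code and a text field; the special_value1 and special_value2 fields read above are custom fields of this project's bot, not part of the stock reply. A minimal parse check with a made-up payload (Python 2, like the box code):

```python
# -*- coding: utf-8 -*-
import json

# Made-up sample: "code" and "text" follow the Turing V1 reply format;
# special_value1/special_value2 are this project's own additions.
sample = '{"code": 100000, "text": "\u4f60\u597d", "special_value1": "1", "special_value2": "dance"}'
d = json.loads(sample, encoding='utf-8')
print d['text'].encode('utf-8', 'ignore')  # the answer the robot will speak
print d.get('special_value1', '0')         # default mirrors the box's fallback
```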

Handling different answers differently

This stage handles motion commands (forward, backward, left, right, dance) as well as ordinary conversation.
In short, answers containing motion keywords go to the motion-control box (the controlBox mentioned above), plain-text replies go to the SayWithBehavior box, and the routing is driven by the decision box; a hypothetical sketch of that routing follows.
The motion-control part involves private material and is omitted, so the rest of this section covers the plain-text output, i.e. the SayWithBehavior box.
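
The decision box script itself was not published, so this is only a guess at its shape; the keyword list and the onMotion/onSay output names are assumptions:

```python
# -*- coding: utf-8 -*-
# Hypothetical decision-box script. The keyword list and the
# onMotion/onSay outputs are assumptions, not the author's code.
class MyClass(GeneratedClass):
    def __init__(self):
        GeneratedClass.__init__(self)

    def onInput_onStart(self, p):
        # p is the [answer, special_value1, special_value2] list from the previous box
        answer, special_value1, special_value2 = p
        motion_keywords = ['前', '后', '左', '右', '跳舞']  # forward/back/left/right/dance
        if any(k in answer for k in motion_keywords):
            self.onMotion(p)    # route to controlBox
        else:
            self.onSay(answer)  # route to SayWithBehavior
```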
Its structure is shown below:
(figure: structure of the SayWithBehavior box)
The random-motion boxes contain movements of different durations; see the Timeline section of NAO's official documentation for details.
At the end, the robot returns to its initial posture (standing).
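
Outside Choregraphe, the same say-with-gestures-then-stand idea can be sketched with the standard ALAnimatedSpeech and ALRobotPosture modules (NAO_IP is a placeholder):

```python
# -*- coding: utf-8 -*-
from naoqi import ALProxy

NAO_IP = "192.168.1.100"  # placeholder: your robot's address

speech = ALProxy("ALAnimatedSpeech", NAO_IP, 9559)
posture = ALProxy("ALRobotPosture", NAO_IP, 9559)

speech.say("你好")                 # speaks while playing matching gestures
posture.goToPosture("Stand", 0.8)  # restore the initial standing posture
```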

Open issues

  1. Recording must still be triggered by tapping the head each time; I haven't figured out how to detect speech by sound level and start recording automatically.
  2. The recording end time is currently hard-coded.
  3. The whole loop is slow (NAO recording plus speech recognition), and failures along the way give no immediate feedback, so in practice users end up tapping the head repeatedly to retry recognition.